Skip to content

Commit 609ed97

Browse files
committed
GH-148937: fix for free-threaded GC (RSS based defer)
Asking the OS for the process memory usage doesn't work well given how mimalloc works. It does not promptly return memory to the OS, and so the memory usage doesn't drop after cyclic trash is freed. Instead of asking the OS, use mimalloc APIs to compute how much memory is being used by all mimalloc arenas. We need to stop-the-world to do this, but usually we can avoid doing a collection. So, from a performance perspective, this is worth it.
1 parent 448d7b9 commit 609ed97

2 files changed

Lines changed: 82 additions & 200 deletions

File tree

Include/internal/pycore_interp_structs.h

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -281,15 +281,14 @@ struct _gc_runtime_state {
281281
/* True if gc.freeze() has been used. */
282282
int freeze_active;
283283

284-
/* Memory usage of the process (RSS + swap) after last GC. */
285-
Py_ssize_t last_mem;
284+
/* Sum of area->used*area->block_size across all mimalloc heaps after last
285+
GC, in KB. Updated under stop-the-world so the measurement is accurate
286+
even when OS pages are being reused. */
287+
Py_ssize_t last_gc_used;
286288

287289
/* This accumulates the new object count whenever collection is deferred
288-
due to the RSS increase condition not being meet. Reset on collection. */
290+
due to memory usage not increasing enough. Reset on collection. */
289291
Py_ssize_t deferred_count;
290-
291-
/* Mutex held for gc_should_collect_mem_usage(). */
292-
PyMutex mutex;
293292
#endif
294293
};
295294

Python/gc_free_threading.c

Lines changed: 77 additions & 194 deletions
Original file line numberDiff line numberDiff line change
@@ -17,30 +17,7 @@
1717

1818
#include "pydtrace.h"
1919

20-
// Platform-specific includes for get_process_mem_usage().
21-
#ifdef _WIN32
22-
#include <windows.h>
23-
#include <psapi.h> // For GetProcessMemoryInfo
24-
#elif defined(__linux__)
25-
#include <unistd.h> // For sysconf, getpid
26-
#elif defined(__APPLE__)
27-
#include <mach/mach.h>
28-
#include <mach/task.h> // Required for TASK_VM_INFO
29-
#include <unistd.h> // For sysconf, getpid
30-
#elif defined(__FreeBSD__)
31-
#include <sys/types.h>
32-
#include <sys/sysctl.h>
33-
#include <sys/user.h> // Requires sys/user.h for kinfo_proc definition
34-
#include <kvm.h>
35-
#include <unistd.h> // For sysconf, getpid
36-
#include <fcntl.h> // For O_RDONLY
37-
#include <limits.h> // For _POSIX2_LINE_MAX
38-
#elif defined(__OpenBSD__)
39-
#include <sys/types.h>
40-
#include <sys/sysctl.h>
41-
#include <sys/user.h> // For kinfo_proc
42-
#include <unistd.h> // For sysconf, getpid
43-
#endif
20+
#include "pycore_mimalloc.h" // mi_heap_visit_blocks()
4421

4522
// enable the "mark alive" pass of GC
4623
#define GC_ENABLE_MARK_ALIVE 1
@@ -2016,188 +1993,93 @@ cleanup_worklist(struct worklist *worklist)
20161993
}
20171994
}
20181995

2019-
// Return the memory usage (typically RSS + swap) of the process, in units of
2020-
// KB. Returns -1 if this operation is not supported or on failure.
2021-
static Py_ssize_t
2022-
get_process_mem_usage(void)
2023-
{
2024-
#ifdef _WIN32
2025-
// Windows implementation using GetProcessMemoryInfo
2026-
// Returns WorkingSetSize + PagefileUsage
2027-
PROCESS_MEMORY_COUNTERS pmc;
2028-
HANDLE hProcess = GetCurrentProcess();
2029-
if (NULL == hProcess) {
2030-
// Should not happen for the current process
2031-
return -1;
2032-
}
2033-
2034-
// GetProcessMemoryInfo returns non-zero on success
2035-
if (GetProcessMemoryInfo(hProcess, &pmc, sizeof(pmc))) {
2036-
// Values are in bytes, convert to KB.
2037-
return (Py_ssize_t)((pmc.WorkingSetSize + pmc.PagefileUsage) / 1024);
2038-
}
2039-
else {
2040-
return -1;
2041-
}
1996+
// Visitor for get_all_mimalloc_used_kb(): called once per heap area.
1997+
struct count_used_area_args {
1998+
Py_ssize_t total_bytes;
1999+
};
20422000

2043-
#elif __linux__
2044-
FILE* fp = fopen("/proc/self/status", "r");
2045-
if (fp == NULL) {
2046-
return -1;
2001+
static bool
2002+
count_used_area_visitor(const mi_heap_t *heap, const mi_heap_area_t *area,
2003+
void *block, size_t block_size, void *arg)
2004+
{
2005+
if (block == NULL) {
2006+
// Called once per area when visit_all_blocks=false.
2007+
((struct count_used_area_args *)arg)->total_bytes +=
2008+
(Py_ssize_t)(area->used * area->block_size);
20472009
}
2010+
return true;
2011+
}
20482012

2049-
char line_buffer[256];
2050-
long long rss_kb = -1;
2051-
long long swap_kb = -1;
2052-
2053-
while (fgets(line_buffer, sizeof(line_buffer), fp) != NULL) {
2054-
if (rss_kb == -1 && strncmp(line_buffer, "VmRSS:", 6) == 0) {
2055-
sscanf(line_buffer + 6, "%lld", &rss_kb);
2056-
}
2057-
else if (swap_kb == -1 && strncmp(line_buffer, "VmSwap:", 7) == 0) {
2058-
sscanf(line_buffer + 7, "%lld", &swap_kb);
2013+
// Return the total bytes in use across all mimalloc heaps for all threads, in
2014+
// KB. Requires the world to be stopped so heap structures are stable.
2015+
static Py_ssize_t
2016+
get_all_mimalloc_used_kb(PyInterpreterState *interp)
2017+
{
2018+
assert(interp->stoptheworld.world_stopped);
2019+
struct count_used_area_args args = {0};
2020+
HEAD_LOCK(&_PyRuntime);
2021+
_Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) {
2022+
struct _mimalloc_thread_state *m = &((_PyThreadStateImpl *)p)->mimalloc;
2023+
if (!_Py_atomic_load_int(&m->initialized)) {
2024+
continue;
20592025
}
2060-
if (rss_kb != -1 && swap_kb != -1) {
2061-
break; // Found both
2026+
for (int h = 0; h < _Py_MIMALLOC_HEAP_COUNT; h++) {
2027+
mi_heap_visit_blocks(&m->heaps[h], false,
2028+
count_used_area_visitor, &args);
20622029
}
20632030
}
2064-
fclose(fp);
2065-
2066-
if (rss_kb != -1 && swap_kb != -1) {
2067-
return (Py_ssize_t)(rss_kb + swap_kb);
2068-
}
2069-
return -1;
2070-
2071-
#elif defined(__APPLE__)
2072-
// --- MacOS (Darwin) ---
2073-
// Returns phys_footprint (RAM + compressed memory)
2074-
task_vm_info_data_t vm_info;
2075-
mach_msg_type_number_t count = TASK_VM_INFO_COUNT;
2076-
kern_return_t kerr;
2077-
2078-
kerr = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count);
2079-
if (kerr != KERN_SUCCESS) {
2080-
return -1;
2081-
}
2082-
// phys_footprint is in bytes. Convert to KB.
2083-
return (Py_ssize_t)(vm_info.phys_footprint / 1024);
2084-
2085-
#elif defined(__FreeBSD__)
2086-
// NOTE: Returns RSS only. Per-process swap usage isn't readily available
2087-
long page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
2088-
if (page_size_kb <= 0) {
2089-
return -1;
2090-
}
2091-
2092-
// Using /dev/null for vmcore avoids needing dump file.
2093-
// NULL for kernel file uses running kernel.
2094-
char errbuf[_POSIX2_LINE_MAX]; // For kvm error messages
2095-
kvm_t *kd = kvm_openfiles(NULL, "/dev/null", NULL, O_RDONLY, errbuf);
2096-
if (kd == NULL) {
2097-
return -1;
2098-
}
2099-
2100-
// KERN_PROC_PID filters for the specific process ID
2101-
// n_procs will contain the number of processes returned (should be 1 or 0)
2102-
pid_t pid = getpid();
2103-
int n_procs;
2104-
struct kinfo_proc *kp = kvm_getprocs(kd, KERN_PROC_PID, pid, &n_procs);
2105-
if (kp == NULL) {
2106-
kvm_close(kd);
2107-
return -1;
2108-
}
2109-
2110-
Py_ssize_t rss_kb = -1;
2111-
if (n_procs > 0) {
2112-
// kp[0] contains the info for our process
2113-
// ki_rssize is in pages. Convert to KB.
2114-
rss_kb = (Py_ssize_t)kp->ki_rssize * page_size_kb;
2115-
}
2116-
else {
2117-
// Process with PID not found, shouldn't happen for self.
2118-
rss_kb = -1;
2119-
}
2120-
2121-
kvm_close(kd);
2122-
return rss_kb;
2123-
2124-
#elif defined(__OpenBSD__)
2125-
// NOTE: Returns RSS only. Per-process swap usage isn't readily available
2126-
long page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
2127-
if (page_size_kb <= 0) {
2128-
return -1;
2129-
}
2130-
2131-
struct kinfo_proc kp;
2132-
pid_t pid = getpid();
2133-
int mib[6];
2134-
size_t len = sizeof(kp);
2135-
2136-
mib[0] = CTL_KERN;
2137-
mib[1] = KERN_PROC;
2138-
mib[2] = KERN_PROC_PID;
2139-
mib[3] = pid;
2140-
mib[4] = sizeof(struct kinfo_proc); // size of the structure we want
2141-
mib[5] = 1; // want 1 structure back
2142-
if (sysctl(mib, 6, &kp, &len, NULL, 0) == -1) {
2143-
return -1;
2144-
}
2145-
2146-
if (len > 0) {
2147-
// p_vm_rssize is in pages on OpenBSD. Convert to KB.
2148-
return (Py_ssize_t)kp.p_vm_rssize * page_size_kb;
2149-
}
2150-
else {
2151-
// Process info not returned
2152-
return -1;
2153-
}
2154-
#else
2155-
// Unsupported platform
2156-
return -1;
2157-
#endif
2031+
mi_abandoned_pool_t *pool = &interp->mimalloc.abandoned_pool;
2032+
// Only GC page tags are supported by _mi_abandoned_pool_visit_blocks.
2033+
_mi_abandoned_pool_visit_blocks(pool, _Py_MIMALLOC_HEAP_GC, false,
2034+
count_used_area_visitor, &args);
2035+
_mi_abandoned_pool_visit_blocks(pool, _Py_MIMALLOC_HEAP_GC_PRE, false,
2036+
count_used_area_visitor, &args);
2037+
HEAD_UNLOCK(&_PyRuntime);
2038+
return args.total_bytes / 1024;
21582039
}
21592040

2041+
// Decide whether memory usage has grown enough to warrant a collection.
2042+
// Stops the world to measure mimalloc heap usage accurately; OS-level RSS
2043+
// is unreliable since mimalloc reuses pages without returning them.
21602044
static bool
2161-
gc_should_collect_mem_usage(GCState *gcstate)
2045+
gc_should_collect_mem_usage(PyThreadState *tstate)
21622046
{
2163-
Py_ssize_t mem = get_process_mem_usage();
2164-
if (mem < 0) {
2165-
// Reading process memory usage is not support or failed.
2166-
return true;
2167-
}
2047+
PyInterpreterState *interp = tstate->interp;
2048+
GCState *gcstate = &interp->gc;
21682049
int threshold = gcstate->young.threshold;
2169-
Py_ssize_t deferred = _Py_atomic_load_ssize_relaxed(&gcstate->deferred_count);
2170-
if (deferred > threshold * 40) {
2171-
// Too many new container objects since last GC, even though memory use
2172-
// might not have increased much. This is intended to avoid resource
2173-
// exhaustion if some objects consume resources but don't result in a
2174-
// memory usage increase. We use 40x as the factor here because older
2175-
// versions of Python would do full collections after roughly every
2176-
// 70,000 new container objects.
2050+
2051+
if (gcstate->deferred_count > threshold * 40) {
2052+
// Too many new container objects since last GC, even though memory
2053+
// use might not have increased much. This avoids resource
2054+
// exhaustion if some objects consume resources but don't result in
2055+
// a memory usage increase. We use 40x here because older versions
2056+
// of Python would do full collections after roughly every 70,000
2057+
// new container objects.
21772058
return true;
21782059
}
2179-
Py_ssize_t last_mem = _Py_atomic_load_ssize_relaxed(&gcstate->last_mem);
2180-
Py_ssize_t mem_threshold = Py_MAX(last_mem / 10, 128);
2181-
if ((mem - last_mem) > mem_threshold) {
2182-
// The process memory usage has increased too much, do a collection.
2060+
_PyEval_StopTheWorld(interp);
2061+
Py_ssize_t used = get_all_mimalloc_used_kb(interp);
2062+
Py_ssize_t last = gcstate->last_gc_used;
2063+
Py_ssize_t mem_threshold = Py_MAX(last / 10, 128);
2064+
if ((used - last) > mem_threshold) {
2065+
// Heap usage has grown enough, collect.
2066+
_PyEval_StartTheWorld(interp);
21832067
return true;
21842068
}
2185-
else {
2186-
// The memory usage has not increased enough, defer the collection and
2187-
// clear the young object count so we don't check memory usage again
2188-
// on the next call to gc_should_collect().
2189-
PyMutex_Lock(&gcstate->mutex);
2190-
int young_count = _Py_atomic_exchange_int(&gcstate->young.count, 0);
2191-
_Py_atomic_store_ssize_relaxed(&gcstate->deferred_count,
2192-
gcstate->deferred_count + young_count);
2193-
PyMutex_Unlock(&gcstate->mutex);
2194-
return false;
2195-
}
2069+
// Memory usage has not grown enough. Defer the collection, rolling the
2070+
// young count into deferred_count so we don't keep checking on every
2071+
// call to gc_should_collect().
2072+
int young_count = gcstate->young.count;
2073+
gcstate->young.count = 0;
2074+
gcstate->deferred_count += young_count;
2075+
_PyEval_StartTheWorld(interp);
2076+
return false;
21962077
}
21972078

21982079
static bool
2199-
gc_should_collect(GCState *gcstate)
2080+
gc_should_collect(PyThreadState *tstate)
22002081
{
2082+
GCState *gcstate = &tstate->interp->gc;
22012083
int count = _Py_atomic_load_int_relaxed(&gcstate->young.count);
22022084
int threshold = gcstate->young.threshold;
22032085
int gc_enabled = _Py_atomic_load_int_relaxed(&gcstate->enabled);
@@ -2214,7 +2096,7 @@ gc_should_collect(GCState *gcstate)
22142096
// objects.
22152097
return false;
22162098
}
2217-
return gc_should_collect_mem_usage(gcstate);
2099+
return gc_should_collect_mem_usage(tstate);
22182100
}
22192101

22202102
static void
@@ -2231,7 +2113,7 @@ record_allocation(PyThreadState *tstate)
22312113
_Py_atomic_add_int(&gcstate->young.count, (int)gc->alloc_count);
22322114
gc->alloc_count = 0;
22332115

2234-
if (gc_should_collect(gcstate) &&
2116+
if (gc_should_collect(tstate) &&
22352117
!_Py_atomic_load_int_relaxed(&gcstate->collecting))
22362118
{
22372119
_Py_ScheduleGC(tstate);
@@ -2379,10 +2261,11 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
23792261
// to be freed.
23802262
delete_garbage(state);
23812263

2382-
// Store the current memory usage, can be smaller now if breaking cycles
2383-
// freed some memory.
2384-
Py_ssize_t last_mem = get_process_mem_usage();
2385-
_Py_atomic_store_ssize_relaxed(&state->gcstate->last_mem, last_mem);
2264+
// Record mimalloc heap usage as the baseline for the next collection's
2265+
// growth check. Stop-the-world so the heap structures are stable.
2266+
_PyEval_StopTheWorld(interp);
2267+
state->gcstate->last_gc_used = get_all_mimalloc_used_kb(interp);
2268+
_PyEval_StartTheWorld(interp);
23862269

23872270
// Append objects with legacy finalizers to the "gc.garbage" list.
23882271
handle_legacy_finalizers(state);
@@ -2423,7 +2306,7 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)
24232306
return 0;
24242307
}
24252308

2426-
if (reason == _Py_GC_REASON_HEAP && !gc_should_collect(gcstate)) {
2309+
if (reason == _Py_GC_REASON_HEAP && !gc_should_collect(tstate)) {
24272310
// Don't collect if the threshold is not exceeded.
24282311
_Py_atomic_store_int(&gcstate->collecting, 0);
24292312
return 0;

0 commit comments

Comments
 (0)