From c5520d361c87a7993799e3e98db335d83aec7070 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 10 May 2026 18:56:27 +0100 Subject: [PATCH 1/5] gh-149584: Avoid page reads for hot profiler structs Use exact remote reads for interpreter state, thread state, and interpreter frame structs instead of pulling full remote pages into the profiler page cache. This matches the core change from python/cpython#149585. --- Modules/_remote_debugging/frames.c | 2 +- Modules/_remote_debugging/module.c | 2 +- Modules/_remote_debugging/threads.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index bbdfce3f7201d9..7e56576392737b 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -197,7 +197,7 @@ parse_frame_object( char frame[SIZEOF_INTERP_FRAME]; *address_of_code_object = 0; - Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory( &unwinder->handle, address, SIZEOF_INTERP_FRAME, diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index efdd2e1a2d7b7a..50332645b0197e 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -537,7 +537,7 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self while (current_interpreter != 0) { // Read interpreter state to get the interpreter ID char interp_state_buffer[INTERP_STATE_BUFFER_SIZE]; - if (_Py_RemoteDebug_PagedReadRemoteMemory( + if (_Py_RemoteDebug_ReadRemoteMemory( &self->handle, current_interpreter, INTERP_STATE_BUFFER_SIZE, diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 4daa5e5f92bcd9..31d83f561a8ddf 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -303,7 +303,7 @@ unwind_stack_for_thread( StackChunkList chunks = {0}; char 
ts[SIZEOF_THREAD_STATE]; - int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( &unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, ts); if (bytes_read < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); From 8be8d7d6a825d5ab1aaa6e25b81a31c02f90eee5 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 10 May 2026 18:56:57 +0100 Subject: [PATCH 2/5] gh-149584: Track live remote page cache entries The profiler clears the page cache between samples, so live entries are always packed at the front. Track the live count and only clear/search that prefix instead of scanning all 1024 slots on the hot path. --- Python/remote_debug.h | 55 +++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/Python/remote_debug.h b/Python/remote_debug.h index 6c089a834dcd40..be7fdb7cfa9428 100644 --- a/Python/remote_debug.h +++ b/Python/remote_debug.h @@ -147,6 +147,7 @@ typedef struct { int memfd; #endif page_cache_entry_t pages[MAX_PAGES]; + int page_cache_count; Py_ssize_t page_size; } proc_handle_t; @@ -185,14 +186,16 @@ _Py_RemoteDebug_FreePageCache(proc_handle_t *handle) handle->pages[i].data = NULL; handle->pages[i].valid = 0; } + handle->page_cache_count = 0; } UNUSED static void _Py_RemoteDebug_ClearCache(proc_handle_t *handle) { - for (int i = 0; i < MAX_PAGES; i++) { + for (int i = 0; i < handle->page_cache_count; i++) { handle->pages[i].valid = 0; } + handle->page_cache_count = 0; } #if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX @@ -222,6 +225,7 @@ _Py_RemoteDebug_InitProcHandle(proc_handle_t *handle, pid_t pid) { handle->memfd = -1; #endif handle->page_size = get_page_size(); + handle->page_cache_count = 0; for (int i = 0; i < MAX_PAGES; i++) { handle->pages[i].data = NULL; handle->pages[i].valid = 0; @@ -1287,8 +1291,9 @@ 
_Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out); } - // Search for valid cached page - for (int i = 0; i < MAX_PAGES; i++) { + // Search only the pages used since the last clear. The cache is cleared + // between profiler samples, so entries are packed at the front. + for (int i = 0; i < handle->page_cache_count; i++) { page_cache_entry_t *entry = &handle->pages[i]; if (entry->valid && entry->page_addr == page_base) { memcpy(out, entry->data + offset_in_page, size); @@ -1296,33 +1301,31 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, } } - // Find reusable slot - for (int i = 0; i < MAX_PAGES; i++) { - page_cache_entry_t *entry = &handle->pages[i]; - if (!entry->valid) { + if (handle->page_cache_count < MAX_PAGES) { + page_cache_entry_t *entry = &handle->pages[handle->page_cache_count]; + if (entry->data == NULL) { + entry->data = PyMem_RawMalloc(page_size); if (entry->data == NULL) { - entry->data = PyMem_RawMalloc(page_size); - if (entry->data == NULL) { - PyErr_NoMemory(); - _set_debug_exception_cause(PyExc_MemoryError, - "Cannot allocate %zu bytes for page cache entry " - "during read from PID %d at address 0x%lx", - page_size, handle->pid, addr); - return -1; - } - } - - if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) { - // Try to just copy the exact amount as a fallback - PyErr_Clear(); - goto fallback; + PyErr_NoMemory(); + _set_debug_exception_cause(PyExc_MemoryError, + "Cannot allocate %zu bytes for page cache entry " + "during read from PID %d at address 0x%lx", + page_size, handle->pid, addr); + return -1; } + } - entry->page_addr = page_base; - entry->valid = 1; - memcpy(out, entry->data + offset_in_page, size); - return 0; + if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) { + // Try to just copy the exact amount as a fallback + PyErr_Clear(); + goto fallback; } + + entry->page_addr = 
page_base; + entry->valid = 1; + handle->page_cache_count++; + memcpy(out, entry->data + offset_in_page, size); + return 0; } fallback: From 5dc0309fb79fa09d4f4961d589982a6cea96247c Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 10 May 2026 18:59:37 +0100 Subject: [PATCH 3/5] gh-149584: Batch predicted profiler reads Use the frame cache to predict the next thread state and top frame address, then batch interpreter/thread/frame reads with process_vm_readv when profiling a Linux target. Reuse prefetched frame buffers in the frame walker when the prediction is valid. --- Modules/_remote_debugging/_remote_debugging.h | 13 ++- Modules/_remote_debugging/frame_cache.c | 18 ++++ Modules/_remote_debugging/frames.c | 79 +++++++++++----- Modules/_remote_debugging/module.c | 76 ++++++++++++++-- Modules/_remote_debugging/threads.c | 89 +++++++++++++++++-- Python/remote_debug.h | 43 +++++++++ 6 files changed, 285 insertions(+), 33 deletions(-) diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 7369cd1514c296..df1f8cd1a12292 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -224,6 +224,7 @@ typedef struct { typedef struct { uint64_t thread_id; // 0 = empty slot + uintptr_t thread_state_addr; uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; Py_ssize_t num_addrs; PyObject *frame_list; // owned reference, NULL if empty @@ -302,6 +303,7 @@ typedef struct { int cache_frames; int collect_stats; // whether to collect statistics uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale + uintptr_t cached_tstate_addr; // predicted first thread for batched reads RemoteDebuggingState *cached_state; FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries UnwinderStats stats; // statistics for performance analysis @@ -361,11 +363,14 @@ typedef struct { typedef struct { /* Inputs */ uintptr_t 
frame_addr; // Starting frame address + uintptr_t thread_state_addr; // Owning thread state address uintptr_t base_frame_addr; // Sentinel at bottom (for validation) uintptr_t gc_frame; // GC frame address (0 if not tracking) uintptr_t last_profiled_frame; // Last cached frame (0 if no cache) StackChunkList *chunks; // Pre-copied stack chunks int skip_first_frame; // Skip frame_addr itself (continue from its caller) + const char *prefetched_frame; // Optional already-read frame buffer + uintptr_t prefetched_frame_addr; // Remote address for prefetched_frame /* Outputs */ PyObject *frame_info; // List to append FrameInfo objects @@ -548,6 +553,7 @@ extern int process_frame_chain( extern int frame_cache_init(RemoteUnwinderObject *unwinder); extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder); extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id); +extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr); extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder); extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result); extern int frame_cache_lookup_and_extend( @@ -566,6 +572,7 @@ extern int frame_cache_store( PyObject *frame_list, const uintptr_t *addrs, Py_ssize_t num_addrs, + uintptr_t thread_state_addr, uintptr_t base_frame_addr, uintptr_t last_frame_visited); @@ -605,7 +612,11 @@ extern PyObject* unwind_stack_for_thread( uintptr_t *current_tstate, uintptr_t gil_holder_tstate, uintptr_t gc_frame, - uintptr_t main_thread_tstate + uintptr_t main_thread_tstate, + const char *prefetched_tstate, + uintptr_t prefetched_tstate_addr, + const char *prefetched_frame, + uintptr_t prefetched_frame_addr ); /* Thread stopping functions (for blocking mode) */ diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c index b6566d7cff7b54..d2ec63925680c6 100644 --- 
a/Modules/_remote_debugging/frame_cache.c +++ b/Modules/_remote_debugging/frame_cache.c @@ -53,6 +53,21 @@ frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id) return NULL; } +FrameCacheEntry * +frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr) +{ + if (!unwinder->frame_cache || tstate_addr == 0) { + return NULL; + } + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_state_addr == tstate_addr) { + assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES); + return &unwinder->frame_cache[i]; + } + } + return NULL; +} + // Allocate a cache slot for a thread // Returns NULL if cache is full (graceful degradation) static FrameCacheEntry * @@ -129,6 +144,7 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result) // Clear this entry Py_CLEAR(unwinder->frame_cache[i].frame_list); unwinder->frame_cache[i].thread_id = 0; + unwinder->frame_cache[i].thread_state_addr = 0; unwinder->frame_cache[i].num_addrs = 0; STATS_INC(unwinder, stale_cache_invalidations); } @@ -216,6 +232,7 @@ frame_cache_store( PyObject *frame_list, const uintptr_t *addrs, Py_ssize_t num_addrs, + uintptr_t thread_state_addr, uintptr_t base_frame_addr, uintptr_t last_frame_visited) { @@ -257,6 +274,7 @@ frame_cache_store( return -1; } entry->thread_id = thread_id; + entry->thread_state_addr = thread_state_addr; memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t)); entry->num_addrs = num_addrs; assert(entry->num_addrs == num_addrs); diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index 7e56576392737b..3a7e44f8075acc 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -186,30 +186,16 @@ is_frame_valid( return 1; } -int -parse_frame_object( +static int +parse_frame_buffer( RemoteUnwinderObject *unwinder, PyObject** result, - uintptr_t address, + const char *frame, uintptr_t* address_of_code_object, 
uintptr_t* previous_frame ) { - char frame[SIZEOF_INTERP_FRAME]; *address_of_code_object = 0; - Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - &unwinder->handle, - address, - SIZEOF_INTERP_FRAME, - frame - ); - if (bytes_read < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame"); - return -1; - } - STATS_INC(unwinder, memory_reads); - STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); - *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous); uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable); int frame_valid = is_frame_valid(unwinder, (uintptr_t)frame, code_object); @@ -237,6 +223,31 @@ parse_frame_object( return parse_code_object(unwinder, result, &code_ctx); } +int +parse_frame_object( + RemoteUnwinderObject *unwinder, + PyObject** result, + uintptr_t address, + uintptr_t* address_of_code_object, + uintptr_t* previous_frame +) { + char frame[SIZEOF_INTERP_FRAME]; + Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory( + &unwinder->handle, + address, + SIZEOF_INTERP_FRAME, + frame + ); + if (bytes_read < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame"); + return -1; + } + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); + + return parse_frame_buffer(unwinder, result, frame, address_of_code_object, previous_frame); +} + int parse_frame_from_chunks( RemoteUnwinderObject *unwinder, @@ -312,15 +323,32 @@ process_frame_chain( } assert(frame_count <= MAX_FRAMES); - if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) < 0) { + if (ctx->chunks && ctx->chunks->count > 0) { + if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) == 0) { + goto parsed_frame; + } PyErr_Clear(); + } + { uintptr_t 
address_of_code_object = 0; - if (parse_frame_object(unwinder, &frame, frame_addr, &address_of_code_object, &next_frame_addr) < 0) { + int parse_result; + if (ctx->prefetched_frame && ctx->prefetched_frame_addr == frame_addr) { + parse_result = parse_frame_buffer( + unwinder, &frame, ctx->prefetched_frame, + &address_of_code_object, &next_frame_addr); + } + else { + parse_result = parse_frame_object( + unwinder, &frame, frame_addr, + &address_of_code_object, &next_frame_addr); + } + if (parse_result < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to parse frame object in chain"); return -1; } } +parsed_frame: // Skip first frame if requested (used for cache miss continuation) if (ctx->skip_first_frame && frame_count == 1) { Py_XDECREF(frame); @@ -501,8 +529,16 @@ try_full_cache_hit( PyObject *current_frame = NULL; uintptr_t code_object_addr = 0; uintptr_t previous_frame = 0; - int parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr, + int parse_result; + if (ctx->prefetched_frame && ctx->prefetched_frame_addr == ctx->frame_addr) { + parse_result = parse_frame_buffer(unwinder, &current_frame, + ctx->prefetched_frame, &code_object_addr, &previous_frame); + } + else { + parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr, + &code_object_addr, &previous_frame); + } if (parse_result < 0) { return -1; } @@ -606,7 +642,8 @@ collect_frames_with_cache( } if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs, - ctx->base_frame_addr, ctx->last_frame_visited) < 0) { + ctx->thread_state_addr, ctx->base_frame_addr, + ctx->last_frame_visited) < 0) { return -1; } diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 50332645b0197e..170fa8aa069c2b 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -360,6 +360,7 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, self->cache_frames = 
cache_frames; self->collect_stats = stats; self->stale_invalidation_counter = 0; + self->cached_tstate_addr = 0; self->debug = debug; self->only_active_thread = only_active_thread; self->mode = mode; @@ -473,6 +474,46 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, return 0; } +static int +read_interp_state_and_maybe_thread_frame( + RemoteUnwinderObject *unwinder, + uintptr_t interpreter_addr, + char *interp_state_buffer, + uintptr_t predicted_tstate_addr, + char *tstate_buffer, + int *tstate_read, + uintptr_t predicted_frame_addr, + char *frame_buffer, + int *frame_read) +{ + *tstate_read = 0; + *frame_read = 0; + if (predicted_tstate_addr != 0) { + size_t tstate_size = (size_t)unwinder->debug_offsets.thread_state.size; + _Py_RemoteReadSegment segments[3] = { + {interpreter_addr, interp_state_buffer, INTERP_STATE_BUFFER_SIZE}, + {predicted_tstate_addr, tstate_buffer, tstate_size}, + {predicted_frame_addr, frame_buffer, SIZEOF_INTERP_FRAME}, + }; + int nsegs = predicted_frame_addr != 0 ? 
3 : 2; + Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory( + &unwinder->handle, segments, nsegs); + if (nread >= (Py_ssize_t)INTERP_STATE_BUFFER_SIZE) { + Py_ssize_t with_tstate = (Py_ssize_t)INTERP_STATE_BUFFER_SIZE + + (Py_ssize_t)tstate_size; + *tstate_read = nread >= with_tstate; + *frame_read = nsegs == 3 + && nread == with_tstate + (Py_ssize_t)SIZEOF_INTERP_FRAME; + return 0; + } + } + return _Py_RemoteDebug_ReadRemoteMemory( + &unwinder->handle, + interpreter_addr, + INTERP_STATE_BUFFER_SIZE, + interp_state_buffer); +} + /*[clinic input] @permit_long_docstring_body @critical_section @@ -537,11 +578,29 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self while (current_interpreter != 0) { // Read interpreter state to get the interpreter ID char interp_state_buffer[INTERP_STATE_BUFFER_SIZE]; - if (_Py_RemoteDebug_ReadRemoteMemory( - &self->handle, + char prefetched_tstate[SIZEOF_THREAD_STATE]; + char prefetched_frame[SIZEOF_INTERP_FRAME]; + int have_prefetched_tstate = 0; + int have_prefetched_frame = 0; + uintptr_t predicted_tstate_addr = self->cache_frames ? 
self->cached_tstate_addr : 0; + uintptr_t predicted_frame_addr = 0; + if (predicted_tstate_addr != 0) { + FrameCacheEntry *entry = frame_cache_find_by_tstate(self, predicted_tstate_addr); + if (entry && entry->num_addrs > 0) { + predicted_frame_addr = entry->addrs[0]; + } + } + + if (read_interp_state_and_maybe_thread_frame( + self, current_interpreter, - INTERP_STATE_BUFFER_SIZE, - interp_state_buffer) < 0) { + interp_state_buffer, + predicted_tstate_addr, + prefetched_tstate, + &have_prefetched_tstate, + predicted_frame_addr, + prefetched_frame, + &have_prefetched_frame) < 0) { set_exception_cause(self, PyExc_RuntimeError, "Failed to read interpreter state buffer"); Py_CLEAR(result); goto exit; @@ -611,6 +670,9 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self // Target specific thread (only process first interpreter) current_tstate = self->tstate_addr; } + if (current_tstate != 0) { + self->cached_tstate_addr = current_tstate; + } // Acquire main thread state information uintptr_t main_thread_tstate = GET_MEMBER(uintptr_t, interp_state_buffer, @@ -621,7 +683,11 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self PyObject* frame_info = unwind_stack_for_thread(self, &current_tstate, gil_holder_tstate, gc_frame, - main_thread_tstate); + main_thread_tstate, + have_prefetched_tstate ? prefetched_tstate : NULL, + predicted_tstate_addr, + have_prefetched_frame ? 
prefetched_frame : NULL, + predicted_frame_addr); if (!frame_info) { // Check if this was an intentional skip due to mode-based filtering if ((self->mode == PROFILING_MODE_CPU || self->mode == PROFILING_MODE_GIL || diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 31d83f561a8ddf..fa89cf6406736d 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -289,13 +289,44 @@ typedef struct { unsigned int :24; } _thread_status; +static int +read_thread_state_and_maybe_frame( + RemoteUnwinderObject *unwinder, + uintptr_t tstate_addr, + size_t tstate_size, + char *tstate_buffer, + uintptr_t predicted_frame_addr, + char *frame_buffer, + int *frame_read) +{ + *frame_read = 0; + if (predicted_frame_addr != 0) { + _Py_RemoteReadSegment segments[2] = { + {tstate_addr, tstate_buffer, tstate_size}, + {predicted_frame_addr, frame_buffer, SIZEOF_INTERP_FRAME}, + }; + Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory( + &unwinder->handle, segments, 2); + if (nread >= (Py_ssize_t)tstate_size) { + *frame_read = nread == (Py_ssize_t)(tstate_size + SIZEOF_INTERP_FRAME); + return 0; + } + } + return _Py_RemoteDebug_ReadRemoteMemory( + &unwinder->handle, tstate_addr, tstate_size, tstate_buffer); +} + PyObject* unwind_stack_for_thread( RemoteUnwinderObject *unwinder, uintptr_t *current_tstate, uintptr_t gil_holder_tstate, uintptr_t gc_frame, - uintptr_t main_thread_tstate + uintptr_t main_thread_tstate, + const char *prefetched_tstate, + uintptr_t prefetched_tstate_addr, + const char *prefetched_frame, + uintptr_t prefetched_frame_addr ) { PyObject *frame_info = NULL; PyObject *thread_id = NULL; @@ -303,14 +334,57 @@ unwind_stack_for_thread( StackChunkList chunks = {0}; char ts[SIZEOF_THREAD_STATE]; - int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - &unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, ts); - if (bytes_read < 0) { - set_exception_cause(unwinder, 
PyExc_RuntimeError, "Failed to read thread state"); - goto error; + char local_prefetched_frame[SIZEOF_INTERP_FRAME]; + const char *prefetched_frame_for_ctx = NULL; + int have_prefetched_frame = 0; + uintptr_t predicted_frame_addr = 0; + if (prefetched_tstate && prefetched_tstate_addr == *current_tstate) { + memcpy(ts, prefetched_tstate, (size_t)unwinder->debug_offsets.thread_state.size); + if (prefetched_frame && prefetched_frame_addr != 0) { + have_prefetched_frame = 1; + prefetched_frame_for_ctx = prefetched_frame; + predicted_frame_addr = prefetched_frame_addr; + } + } + else if (unwinder->cache_frames) { + FrameCacheEntry *entry = frame_cache_find_by_tstate(unwinder, *current_tstate); + if (entry && entry->num_addrs > 0) { + predicted_frame_addr = entry->addrs[0]; + } + + int bytes_read = read_thread_state_and_maybe_frame( + unwinder, + *current_tstate, + (size_t)unwinder->debug_offsets.thread_state.size, + ts, + predicted_frame_addr, + local_prefetched_frame, + &have_prefetched_frame); + if (bytes_read < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); + goto error; + } + if (have_prefetched_frame) { + prefetched_frame_for_ctx = local_prefetched_frame; + } + } + else { + int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( + &unwinder->handle, + *current_tstate, + (size_t)unwinder->debug_offsets.thread_state.size, + ts); + if (bytes_read < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); + goto error; + } } STATS_INC(unwinder, memory_reads); STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size); + if (have_prefetched_frame) { + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); + } long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id); @@ -432,9 +506,12 @@ unwind_stack_for_thread( uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; FrameWalkContext ctx = { .frame_addr = frame_addr, + 
.thread_state_addr = *current_tstate, .base_frame_addr = base_frame_addr, .gc_frame = gc_frame, .chunks = &chunks, + .prefetched_frame = have_prefetched_frame ? prefetched_frame_for_ctx : NULL, + .prefetched_frame_addr = predicted_frame_addr, .frame_info = frame_info, .frame_addrs = addrs, .num_addrs = 0, diff --git a/Python/remote_debug.h b/Python/remote_debug.h index be7fdb7cfa9428..7b2c4f3bcb8077 100644 --- a/Python/remote_debug.h +++ b/Python/remote_debug.h @@ -1333,6 +1333,49 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out); } +typedef struct { + uintptr_t remote_addr; + void *local_buf; + size_t size; +} _Py_RemoteReadSegment; + +#define _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS 4 + +// Batched read of multiple remote regions in a single syscall when supported. +// Returns total bytes read (>= 0) on success, -1 if batched reads are +// unavailable or the syscall failed. Callers compare the return value against +// cumulative segment sizes to determine which segments were fully populated. 
+UNUSED static Py_ssize_t +_Py_RemoteDebug_BatchedReadRemoteMemory( + proc_handle_t *handle, + const _Py_RemoteReadSegment *segments, + int nsegs) +{ +#if defined(__linux__) && HAVE_PROCESS_VM_READV + if (handle->memfd == -1 + && nsegs > 0 + && nsegs <= _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS) { + struct iovec local[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS]; + struct iovec remote[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS]; + for (int i = 0; i < nsegs; i++) { + local[i].iov_base = segments[i].local_buf; + local[i].iov_len = segments[i].size; + remote[i].iov_base = (void *)segments[i].remote_addr; + remote[i].iov_len = segments[i].size; + } + ssize_t nread = process_vm_readv(handle->pid, local, nsegs, remote, nsegs, 0); + if (nread >= 0) { + return (Py_ssize_t)nread; + } + } +#else + (void)handle; + (void)segments; + (void)nsegs; +#endif + return -1; +} + UNUSED static int _Py_RemoteDebug_ReadDebugOffsets( proc_handle_t *handle, From c69a0f361700db334d41615c9c6b647f5ef353c1 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 10 May 2026 19:00:33 +0100 Subject: [PATCH 4/5] gh-149584: Reuse profiler result objects Cache the last FrameInfo tuple per code object/instruction offset, reuse cached thread id objects, and append cached parent frames directly on full frame-cache hits. This cuts Python allocation churn in the steady-state profiler path. 
--- Modules/_remote_debugging/_remote_debugging.h | 3 +++ Modules/_remote_debugging/code_objects.c | 13 +++++++++++ Modules/_remote_debugging/frame_cache.c | 8 +++++++ Modules/_remote_debugging/frames.c | 22 +++++-------------- Modules/_remote_debugging/module.c | 1 + Modules/_remote_debugging/threads.c | 14 +++++++++--- 6 files changed, 41 insertions(+), 20 deletions(-) diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index df1f8cd1a12292..422fe9e09873e8 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -215,6 +215,8 @@ typedef struct { PyObject *file_name; int first_lineno; PyObject *linetable; // bytes + PyObject *last_frame_info; + ptrdiff_t last_addrq; uintptr_t addr_code_adaptive; } CachedCodeMetadata; @@ -227,6 +229,7 @@ typedef struct { uintptr_t thread_state_addr; uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; Py_ssize_t num_addrs; + PyObject *thread_id_obj; // owned reference, NULL if empty PyObject *frame_list; // owned reference, NULL if empty } FrameCacheEntry; diff --git a/Modules/_remote_debugging/code_objects.c b/Modules/_remote_debugging/code_objects.c index 7b95c0f2d4fa8d..2ac6edb3f662f6 100644 --- a/Modules/_remote_debugging/code_objects.c +++ b/Modules/_remote_debugging/code_objects.c @@ -405,6 +405,8 @@ parse_code_object(RemoteUnwinderObject *unwinder, meta->func_name = func; meta->file_name = file; meta->linetable = linetable; + meta->last_frame_info = NULL; + meta->last_addrq = -1; meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno); meta->addr_code_adaptive = real_address + (uintptr_t)unwinder->debug_offsets.code_object.co_code_adaptive; @@ -482,6 +484,12 @@ parse_code_object(RemoteUnwinderObject *unwinder, addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive; #endif ; // Empty statement to avoid C23 extension warning + + if (!unwinder->opcodes && meta->last_frame_info != 
NULL && meta->last_addrq == addrq) { + *result = Py_NewRef(meta->last_frame_info); + return 0; + } + LocationInfo info = {0}; bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable), PyBytes_GET_SIZE(meta->linetable), @@ -529,6 +537,11 @@ parse_code_object(RemoteUnwinderObject *unwinder, goto error; } + if (!unwinder->opcodes) { + Py_XSETREF(meta->last_frame_info, Py_NewRef(tuple)); + meta->last_addrq = addrq; + } + *result = tuple; return 0; diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c index d2ec63925680c6..19fc406bca9ac9 100644 --- a/Modules/_remote_debugging/frame_cache.c +++ b/Modules/_remote_debugging/frame_cache.c @@ -30,6 +30,7 @@ frame_cache_cleanup(RemoteUnwinderObject *unwinder) return; } for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + Py_CLEAR(unwinder->frame_cache[i].thread_id_obj); Py_CLEAR(unwinder->frame_cache[i].frame_list); } PyMem_Free(unwinder->frame_cache); @@ -142,6 +143,7 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result) } if (!found) { // Clear this entry + Py_CLEAR(unwinder->frame_cache[i].thread_id_obj); Py_CLEAR(unwinder->frame_cache[i].frame_list); unwinder->frame_cache[i].thread_id = 0; unwinder->frame_cache[i].thread_state_addr = 0; @@ -275,6 +277,12 @@ frame_cache_store( } entry->thread_id = thread_id; entry->thread_state_addr = thread_state_addr; + if (entry->thread_id_obj == NULL) { + entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id); + if (entry->thread_id_obj == NULL) { + return -1; + } + } memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t)); entry->num_addrs = num_addrs; assert(entry->num_addrs == num_addrs); diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index 3a7e44f8075acc..1eafd3588db50b 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -543,35 +543,23 @@ try_full_cache_hit( return -1; } - Py_ssize_t cached_size = 
PyList_GET_SIZE(entry->frame_list); - PyObject *parent_slice = NULL; - if (cached_size > 1) { - parent_slice = PyList_GetSlice(entry->frame_list, 1, cached_size); - if (!parent_slice) { - Py_XDECREF(current_frame); - return -1; - } - } - if (current_frame != NULL) { if (PyList_Append(ctx->frame_info, current_frame) < 0) { Py_DECREF(current_frame); - Py_XDECREF(parent_slice); return -1; } Py_DECREF(current_frame); STATS_ADD(unwinder, frames_read_from_memory, 1); } - if (parent_slice) { - Py_ssize_t cur_size = PyList_GET_SIZE(ctx->frame_info); - int result = PyList_SetSlice(ctx->frame_info, cur_size, cur_size, parent_slice); - Py_DECREF(parent_slice); - if (result < 0) { + Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list); + for (Py_ssize_t i = 1; i < cached_size; i++) { + PyObject *cached_frame = PyList_GET_ITEM(entry->frame_list, i); + if (PyList_Append(ctx->frame_info, cached_frame) < 0) { return -1; } - STATS_ADD(unwinder, frames_read_from_cache, cached_size - 1); } + STATS_ADD(unwinder, frames_read_from_cache, cached_size > 1 ? 
cached_size - 1 : 0); STATS_INC(unwinder, frame_cache_hits); return 1; diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 170fa8aa069c2b..25928b658fd147 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -166,6 +166,7 @@ cached_code_metadata_destroy(void *ptr) Py_DECREF(meta->func_name); Py_DECREF(meta->file_name); Py_DECREF(meta->linetable); + Py_XDECREF(meta->last_frame_info); PyMem_RawFree(meta); } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index fa89cf6406736d..3e3164094480ea 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -546,10 +546,18 @@ unwind_stack_for_thread( *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next); - thread_id = PyLong_FromLongLong(tid); + if (unwinder->cache_frames) { + FrameCacheEntry *entry = frame_cache_find(unwinder, (uint64_t)tid); + if (entry && entry->thread_id_obj) { + thread_id = Py_NewRef(entry->thread_id_obj); + } + } if (thread_id == NULL) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID"); - goto error; + thread_id = PyLong_FromLongLong(tid); + if (thread_id == NULL) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID"); + goto error; + } } RemoteDebuggingState *state = RemoteDebugging_GetStateFromObject((PyObject*)unwinder); From 7a85c9a720d4687cf97ccd9f4c36bb6d5cd95a53 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 10 May 2026 19:27:03 +0100 Subject: [PATCH 5/5] gh-149584: Add NEWS for Tachyon profiler overhead fix --- .../Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst diff --git a/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst 
b/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst new file mode 100644 index 00000000000000..6734250fdd6af3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst @@ -0,0 +1,4 @@ +Fix excessive overhead in the Tachyon profiler when inspecting a remote +process by avoiding repeated remote page-cache scans, batching predicted +remote reads, and reusing cached profiler result objects. Patch by Pablo +Galindo and Maurycy Pawłowski-Wieroński.