Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix excessive overhead in the Tachyon profiler when inspecting a remote
process by avoiding repeated remote page-cache scans, batching predicted
remote reads, and reusing cached profiler result objects. Patch by Pablo
Galindo and Maurycy Pawłowski-Wieroński.
16 changes: 15 additions & 1 deletion Modules/_remote_debugging/_remote_debugging.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,8 @@ typedef struct {
PyObject *file_name;
int first_lineno;
PyObject *linetable; // bytes
PyObject *last_frame_info;
ptrdiff_t last_addrq;
uintptr_t addr_code_adaptive;
} CachedCodeMetadata;

Expand All @@ -224,8 +226,10 @@ typedef struct {

// One slot of the unwinder's frame cache. Slots are looked up either by
// thread_id or by thread_state_addr.
typedef struct {
uint64_t thread_id; // 0 = empty slot
uintptr_t thread_state_addr; // thread state address recorded by frame_cache_store(); reset to 0 when the slot is invalidated as stale
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; // frame addresses copied in by frame_cache_store()
Py_ssize_t num_addrs; // number of entries of addrs in use (at most FRAME_CACHE_MAX_FRAMES)
PyObject *thread_id_obj; // owned reference, NULL if empty; PyLong built from thread_id by frame_cache_store()
PyObject *frame_list; // owned reference, NULL if empty
} FrameCacheEntry;

Expand Down Expand Up @@ -302,6 +306,7 @@ typedef struct {
int cache_frames;
int collect_stats; // whether to collect statistics
uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale
uintptr_t cached_tstate_addr; // predicted first thread for batched reads
RemoteDebuggingState *cached_state;
FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries
UnwinderStats stats; // statistics for performance analysis
Expand Down Expand Up @@ -361,11 +366,14 @@ typedef struct {
typedef struct {
/* Inputs */
uintptr_t frame_addr; // Starting frame address
uintptr_t thread_state_addr; // Owning thread state address
uintptr_t base_frame_addr; // Sentinel at bottom (for validation)
uintptr_t gc_frame; // GC frame address (0 if not tracking)
uintptr_t last_profiled_frame; // Last cached frame (0 if no cache)
StackChunkList *chunks; // Pre-copied stack chunks
int skip_first_frame; // Skip frame_addr itself (continue from its caller)
const char *prefetched_frame; // Optional already-read frame buffer
uintptr_t prefetched_frame_addr; // Remote address for prefetched_frame

/* Outputs */
PyObject *frame_info; // List to append FrameInfo objects
Expand Down Expand Up @@ -548,6 +556,7 @@ extern int process_frame_chain(
extern int frame_cache_init(RemoteUnwinderObject *unwinder);
// Drops the thread_id_obj and frame_list references held by each cache slot,
// then frees the frame_cache array itself.
extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
// Returns the cache slot whose thread_state_addr equals tstate_addr.
// Returns NULL when no slot matches, when tstate_addr is 0, or when the
// unwinder has no frame cache.
extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
extern int frame_cache_lookup_and_extend(
Expand All @@ -566,6 +575,7 @@ extern int frame_cache_store(
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited);

Expand Down Expand Up @@ -605,7 +615,11 @@ extern PyObject* unwind_stack_for_thread(
uintptr_t *current_tstate,
uintptr_t gil_holder_tstate,
uintptr_t gc_frame,
uintptr_t main_thread_tstate
uintptr_t main_thread_tstate,
const char *prefetched_tstate,
uintptr_t prefetched_tstate_addr,
const char *prefetched_frame,
uintptr_t prefetched_frame_addr
);

/* Thread stopping functions (for blocking mode) */
Expand Down
13 changes: 13 additions & 0 deletions Modules/_remote_debugging/code_objects.c
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,8 @@ parse_code_object(RemoteUnwinderObject *unwinder,
meta->func_name = func;
meta->file_name = file;
meta->linetable = linetable;
meta->last_frame_info = NULL;
meta->last_addrq = -1;
meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno);
meta->addr_code_adaptive = real_address + (uintptr_t)unwinder->debug_offsets.code_object.co_code_adaptive;

Expand Down Expand Up @@ -482,6 +484,12 @@ parse_code_object(RemoteUnwinderObject *unwinder,
addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive;
#endif
; // Empty statement to avoid C23 extension warning

if (!unwinder->opcodes && meta->last_frame_info != NULL && meta->last_addrq == addrq) {
*result = Py_NewRef(meta->last_frame_info);
return 0;
}

LocationInfo info = {0};
bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable),
PyBytes_GET_SIZE(meta->linetable),
Expand Down Expand Up @@ -529,6 +537,11 @@ parse_code_object(RemoteUnwinderObject *unwinder,
goto error;
}

if (!unwinder->opcodes) {
Py_XSETREF(meta->last_frame_info, Py_NewRef(tuple));
meta->last_addrq = addrq;
}

*result = tuple;
return 0;

Expand Down
26 changes: 26 additions & 0 deletions Modules/_remote_debugging/frame_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ frame_cache_cleanup(RemoteUnwinderObject *unwinder)
return;
}
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
Py_CLEAR(unwinder->frame_cache[i].frame_list);
}
PyMem_Free(unwinder->frame_cache);
Expand All @@ -53,6 +54,21 @@ frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
return NULL;
}

// Find the frame cache slot recorded for the thread state at tstate_addr.
//
// Returns a borrowed pointer into unwinder->frame_cache, or NULL when the
// unwinder has no cache, tstate_addr is 0, or no slot carries that address.
FrameCacheEntry *
frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr)
{
    // Invalidated slots have thread_state_addr reset to 0, so a zero lookup
    // key is rejected up front instead of being allowed to match one of them.
    if (tstate_addr == 0 || unwinder->frame_cache == NULL) {
        return NULL;
    }

    FrameCacheEntry *slot = unwinder->frame_cache;
    FrameCacheEntry *const end = slot + FRAME_CACHE_MAX_THREADS;
    for (; slot != end; slot++) {
        if (slot->thread_state_addr != tstate_addr) {
            continue;
        }
        assert(slot->num_addrs <= FRAME_CACHE_MAX_FRAMES);
        return slot;
    }
    return NULL;
}

// Allocate a cache slot for a thread
// Returns NULL if cache is full (graceful degradation)
static FrameCacheEntry *
Expand Down Expand Up @@ -127,8 +143,10 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
}
if (!found) {
// Clear this entry
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
Py_CLEAR(unwinder->frame_cache[i].frame_list);
unwinder->frame_cache[i].thread_id = 0;
unwinder->frame_cache[i].thread_state_addr = 0;
unwinder->frame_cache[i].num_addrs = 0;
STATS_INC(unwinder, stale_cache_invalidations);
}
Expand Down Expand Up @@ -216,6 +234,7 @@ frame_cache_store(
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited)
{
Expand Down Expand Up @@ -257,6 +276,13 @@ frame_cache_store(
return -1;
}
entry->thread_id = thread_id;
entry->thread_state_addr = thread_state_addr;
if (entry->thread_id_obj == NULL) {
entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
if (entry->thread_id_obj == NULL) {
return -1;
}
}
memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
entry->num_addrs = num_addrs;
assert(entry->num_addrs == num_addrs);
Expand Down
101 changes: 63 additions & 38 deletions Modules/_remote_debugging/frames.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,30 +186,16 @@ is_frame_valid(
return 1;
}

int
parse_frame_object(
static int
parse_frame_buffer(
RemoteUnwinderObject *unwinder,
PyObject** result,
uintptr_t address,
const char *frame,
uintptr_t* address_of_code_object,
uintptr_t* previous_frame
) {
char frame[SIZEOF_INTERP_FRAME];
*address_of_code_object = 0;

Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
&unwinder->handle,
address,
SIZEOF_INTERP_FRAME,
frame
);
if (bytes_read < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
return -1;
}
STATS_INC(unwinder, memory_reads);
STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);

*previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous);
uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable);
int frame_valid = is_frame_valid(unwinder, (uintptr_t)frame, code_object);
Expand Down Expand Up @@ -237,6 +223,31 @@ parse_frame_object(
return parse_code_object(unwinder, result, &code_ctx);
}

// Read one interpreter frame from the remote process at `address` and parse it.
//
// The frame is copied into a local buffer of SIZEOF_INTERP_FRAME bytes and
// handed to parse_frame_buffer(), whose return value is passed through
// unchanged. Returns -1 with a RuntimeError cause set when the remote read
// fails.
int
parse_frame_object(
    RemoteUnwinderObject *unwinder,
    PyObject** result,
    uintptr_t address,
    uintptr_t* address_of_code_object,
    uintptr_t* previous_frame
)
{
    char frame_buf[SIZEOF_INTERP_FRAME];

    if (_Py_RemoteDebug_ReadRemoteMemory(&unwinder->handle, address,
                                         SIZEOF_INTERP_FRAME, frame_buf) < 0) {
        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
        return -1;
    }

    // Count the read only once it has succeeded.
    STATS_INC(unwinder, memory_reads);
    STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);

    return parse_frame_buffer(unwinder, result, frame_buf,
                              address_of_code_object, previous_frame);
}

int
parse_frame_from_chunks(
RemoteUnwinderObject *unwinder,
Expand Down Expand Up @@ -312,15 +323,32 @@ process_frame_chain(
}
assert(frame_count <= MAX_FRAMES);

if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) < 0) {
if (ctx->chunks && ctx->chunks->count > 0) {
if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) == 0) {
goto parsed_frame;
}
PyErr_Clear();
}
{
uintptr_t address_of_code_object = 0;
if (parse_frame_object(unwinder, &frame, frame_addr, &address_of_code_object, &next_frame_addr) < 0) {
int parse_result;
if (ctx->prefetched_frame && ctx->prefetched_frame_addr == frame_addr) {
parse_result = parse_frame_buffer(
unwinder, &frame, ctx->prefetched_frame,
&address_of_code_object, &next_frame_addr);
}
else {
parse_result = parse_frame_object(
unwinder, &frame, frame_addr,
&address_of_code_object, &next_frame_addr);
}
if (parse_result < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to parse frame object in chain");
return -1;
}
}

parsed_frame:
// Skip first frame if requested (used for cache miss continuation)
if (ctx->skip_first_frame && frame_count == 1) {
Py_XDECREF(frame);
Expand Down Expand Up @@ -501,41 +529,37 @@ try_full_cache_hit(
PyObject *current_frame = NULL;
uintptr_t code_object_addr = 0;
uintptr_t previous_frame = 0;
int parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr,
int parse_result;
if (ctx->prefetched_frame && ctx->prefetched_frame_addr == ctx->frame_addr) {
parse_result = parse_frame_buffer(unwinder, &current_frame,
ctx->prefetched_frame,
&code_object_addr, &previous_frame);
}
else {
parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr,
&code_object_addr, &previous_frame);
}
if (parse_result < 0) {
return -1;
}

Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list);
PyObject *parent_slice = NULL;
if (cached_size > 1) {
parent_slice = PyList_GetSlice(entry->frame_list, 1, cached_size);
if (!parent_slice) {
Py_XDECREF(current_frame);
return -1;
}
}

if (current_frame != NULL) {
if (PyList_Append(ctx->frame_info, current_frame) < 0) {
Py_DECREF(current_frame);
Py_XDECREF(parent_slice);
return -1;
}
Py_DECREF(current_frame);
STATS_ADD(unwinder, frames_read_from_memory, 1);
}

if (parent_slice) {
Py_ssize_t cur_size = PyList_GET_SIZE(ctx->frame_info);
int result = PyList_SetSlice(ctx->frame_info, cur_size, cur_size, parent_slice);
Py_DECREF(parent_slice);
if (result < 0) {
Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list);
for (Py_ssize_t i = 1; i < cached_size; i++) {
PyObject *cached_frame = PyList_GET_ITEM(entry->frame_list, i);
if (PyList_Append(ctx->frame_info, cached_frame) < 0) {
return -1;
}
STATS_ADD(unwinder, frames_read_from_cache, cached_size - 1);
}
STATS_ADD(unwinder, frames_read_from_cache, cached_size > 1 ? cached_size - 1 : 0);

STATS_INC(unwinder, frame_cache_hits);
return 1;
Expand Down Expand Up @@ -606,7 +630,8 @@ collect_frames_with_cache(
}

if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs,
ctx->base_frame_addr, ctx->last_frame_visited) < 0) {
ctx->thread_state_addr, ctx->base_frame_addr,
ctx->last_frame_visited) < 0) {
return -1;
}

Expand Down
Loading
Loading