Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Include/cpython/pystats.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ typedef struct _optimization_stats {
uint64_t unknown_callee;
uint64_t trace_immediately_deopts;
uint64_t executors_invalidated;
uint64_t fitness_terminated_traces;
uint64_t best_exit_fallback;
UOpStats opcode[PYSTATS_MAX_UOP_ID + 1];
uint64_t unsupported_opcode[256];
uint64_t trace_length_hist[_Py_UOP_HIST_SIZE];
Expand Down
14 changes: 14 additions & 0 deletions Include/internal/pycore_interp_structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,20 @@ typedef struct _PyOptimizationConfig {
uint16_t side_exit_initial_value;
uint16_t side_exit_initial_backoff;

// Trace fitness thresholds
Comment thread
cocolato marked this conversation as resolved.
uint16_t fitness_initial;
uint16_t fitness_initial_side;
uint16_t fitness_per_instruction;
uint16_t fitness_branch_biased;
uint16_t fitness_branch_unbiased;
uint16_t fitness_backward_edge;
uint16_t fitness_frame_entry;

// Exit quality thresholds for fitness-based trace termination
uint16_t exit_quality_enter_executor;
uint16_t exit_quality_default;
uint16_t exit_quality_specializable;

// Optimization flags
bool specialization_enabled;
bool uops_optimize_enabled;
Expand Down
23 changes: 22 additions & 1 deletion Include/internal/pycore_optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,23 @@ extern "C" {
#include "pycore_optimizer_types.h"
#include <stdbool.h>

/* Default fitness configuration values for trace quality control.
* These can be overridden via PYTHON_JIT_FITNESS_* environment variables. */
#define FITNESS_PER_INSTRUCTION 2
#define FITNESS_INITIAL (UOP_MAX_TRACE_LENGTH * FITNESS_PER_INSTRUCTION)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd stick with 2000. We really don't need traces of over 1000 uops; it is far more important to find a good stopping point than to have very long traces

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@markshannon 1000 uops is only roughly 300 uops after optimization, which corresponds to roughly only 60 bytecode instructions. That's too low. A reminder that trace recording spews out a lot of uops that are only optimized away later by the optimizer.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cocolato I suspect richards perf regression might be due to too low trace length, I experimented with richards before and it's quite sensitive to how well the optimizer performs

Copy link
Copy Markdown
Member Author

@cocolato cocolato Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, and I tried reducing the branch/frame penalty, which also improved richards' performance.

#define FITNESS_INITIAL_SIDE 800
#define FITNESS_BRANCH_BIASED 5
#define FITNESS_BRANCH_UNBIASED 25
#define FITNESS_BACKWARD_EDGE 80
#define FITNESS_FRAME_ENTRY 10

/* Default exit quality constants for fitness-based trace termination.
* Higher values mean better places to stop the trace.
* These can be overridden via PYTHON_JIT_EXIT_QUALITY_* environment variables. */
#define EXIT_QUALITY_ENTER_EXECUTOR 500
#define EXIT_QUALITY_DEFAULT 200
#define EXIT_QUALITY_SPECIALIZABLE 50


typedef struct _PyJitUopBuffer {
_PyUOpInstruction *start;
Expand Down Expand Up @@ -101,7 +118,11 @@ typedef struct _PyJitTracerPreviousState {
} _PyJitTracerPreviousState;

typedef struct _PyJitTracerTranslatorState {
int jump_backward_seen;
int32_t fitness; // Current trace fitness, starts high, decrements
int32_t best_exit_quality; // Best exit quality seen so far
int best_exit_buffer_pos; // Position in code_buffer of best exit (-1=none)
uint32_t best_exit_target; // Bytecode target of best exit point
int frame_depth; // Current inline depth (0 = root frame)
} _PyJitTracerTranslatorState;

typedef struct _PyJitTracerState {
Expand Down
178 changes: 173 additions & 5 deletions Python/optimizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -549,8 +549,6 @@ dynamic_exit_uop[MAX_UOP_ID + 1] = {
};


#define CONFIDENCE_RANGE 1000
#define CONFIDENCE_CUTOFF 333

#ifdef Py_DEBUG
#define DPRINTF(level, ...) \
Expand Down Expand Up @@ -598,6 +596,86 @@ add_to_trace(
((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))


/* Compute branch bias from the 16-bit branch history register.
 * Returns 0 (completely unpredictable, 50/50) to 8 (fully biased). */
static inline int
compute_branch_bias(uint16_t history)
{
    /* Kernighan population count: each iteration clears the lowest set bit. */
    int taken = 0;
    for (uint16_t h = history; h != 0; h &= (uint16_t)(h - 1)) {
        taken++;
    }
    /* Distance from the 50/50 midpoint (8 of the 16 history bits set). */
    return taken >= 8 ? taken - 8 : 8 - taken;
}

/* Compute exit quality for the current trace position.
* Higher values mean it's a better place to stop the trace. */
static inline int32_t
compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode,
Comment thread
cocolato marked this conversation as resolved.
const _PyOptimizationConfig *cfg)
{
if (target_instr->op.code == ENTER_EXECUTOR) {
return (int32_t)cfg->exit_quality_enter_executor;
}
if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) {
return (int32_t)cfg->exit_quality_specializable;
}
return (int32_t)cfg->exit_quality_default;
}

/* Try to truncate the trace to the best recorded exit point.
* Returns 1 if successful, 0 if no valid best exit exists.
* Enforces progress constraints: the fallback position must satisfy
* the minimum trace length requirements. */
static inline int
Comment thread
cocolato marked this conversation as resolved.
Outdated
try_best_exit_fallback(
_PyJitUopBuffer *trace,
_PyJitTracerTranslatorState *ts,
bool progress_needed)
{
int best_pos = ts->best_exit_buffer_pos;
if (best_pos <= 0) {
return 0;
} else if (progress_needed && best_pos <= CODE_SIZE_NO_PROGRESS) {
return 0;
} else if (!progress_needed && best_pos <= CODE_SIZE_EMPTY) {
return 0;
}
trace->next = trace->start + best_pos;
/* Caller must add terminator (_EXIT_TRACE) after this */
return 1;
}

/* Update trace fitness after translating one bytecode instruction. */
static inline void
update_trace_fitness(
Comment thread
cocolato marked this conversation as resolved.
Outdated
_PyJitTracerTranslatorState *ts,
int opcode,
_Py_CODEUNIT *target_instr,
const _PyOptimizationConfig *cfg)
{
ts->fitness -= cfg->fitness_per_instruction;

switch (opcode) {
case POP_JUMP_IF_FALSE:
case POP_JUMP_IF_TRUE:
case POP_JUMP_IF_NONE:
case POP_JUMP_IF_NOT_NONE: {
int bias = compute_branch_bias(target_instr[1].cache);
/* Linear interpolation: bias 0 → unbiased penalty, bias 8 → biased penalty */
int penalty = cfg->fitness_branch_unbiased
- (bias * (cfg->fitness_branch_unbiased - cfg->fitness_branch_biased)) / 8;
ts->fitness -= penalty;
break;
}
case JUMP_BACKWARD:
case JUMP_BACKWARD_JIT:
case JUMP_BACKWARD_NO_JIT:
ts->fitness -= cfg->fitness_backward_edge;
break;
default:
break;
}
}
Comment on lines +633 to +637
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The newest result with lower frame penalty:

+----------------------+----------+-----------------------+
| Benchmark            | baseline | fitness               |
+======================+==========+=======================+
| raytrace             | 262 ms   | 212 ms: 1.24x faster  |
+----------------------+----------+-----------------------+
| pickle_pure_python   | 277 us   | 254 us: 1.09x faster  |
+----------------------+----------+-----------------------+
| go                   | 83.7 ms  | 78.8 ms: 1.06x faster |
+----------------------+----------+-----------------------+
| xml_etree_iterparse  | 74.1 ms  | 69.9 ms: 1.06x faster |
+----------------------+----------+-----------------------+
| xml_etree_process    | 50.4 ms  | 48.3 ms: 1.04x faster |
+----------------------+----------+-----------------------+
| xml_etree_generate   | 77.3 ms  | 74.1 ms: 1.04x faster |
+----------------------+----------+-----------------------+
| xml_etree_parse      | 122 ms   | 119 ms: 1.03x faster  |
+----------------------+----------+-----------------------+
| regex_compile        | 103 ms   | 99.7 ms: 1.03x faster |
+----------------------+----------+-----------------------+
| deltablue            | 2.19 ms  | 2.14 ms: 1.03x faster |
+----------------------+----------+-----------------------+
| unpickle_pure_python | 171 us   | 167 us: 1.02x faster  |
+----------------------+----------+-----------------------+
| regex_effbot         | 2.16 ms  | 2.12 ms: 1.02x faster |
+----------------------+----------+-----------------------+
| fannkuch             | 253 ms   | 249 ms: 1.02x faster  |
+----------------------+----------+-----------------------+
| json_loads           | 19.3 us  | 19.1 us: 1.01x faster |
+----------------------+----------+-----------------------+
| json_dumps           | 7.60 ms  | 7.66 ms: 1.01x slower |
+----------------------+----------+-----------------------+
| pidigits             | 136 ms   | 138 ms: 1.01x slower  |
+----------------------+----------+-----------------------+
| pyflate              | 267 ms   | 273 ms: 1.02x slower  |
+----------------------+----------+-----------------------+
| float                | 45.2 ms  | 46.2 ms: 1.02x slower |
+----------------------+----------+-----------------------+
| richards             | 16.5 ms  | 17.3 ms: 1.05x slower |
+----------------------+----------+-----------------------+
| Geometric mean       | (ref)    | 1.03x faster          |
+----------------------+----------+-----------------------+



static int
is_terminator(const _PyUOpInstruction *uop)
{
Expand Down Expand Up @@ -734,13 +812,25 @@ _PyJit_translate_single_bytecode_to_trace(
DPRINTF(2, "Unsupported: oparg too large\n");
unsupported:
{
// Rewind to previous instruction and replace with _EXIT_TRACE.
// If we have a high-quality best_exit (enter_executor, etc.),
Comment thread
cocolato marked this conversation as resolved.
Outdated
// prefer it over rewinding to last _SET_IP — this covers the
// main unsupported path, not just the edge case.
_PyJitTracerTranslatorState *ts_unsup = &tracer->translator_state;
if (ts_unsup->best_exit_quality > (int32_t)tstate->interp->opt_config.exit_quality_default &&
try_best_exit_fallback(trace, ts_unsup, progress_needed)) {
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts_unsup->best_exit_target);
uop_buffer_last(trace)->operand1 = true; // is_control_flow
OPT_STAT_INC(best_exit_fallback);
DPRINTF(2, "Best-exit fallback at unsupported (pos=%d, quality=%d)\n",
ts_unsup->best_exit_buffer_pos, ts_unsup->best_exit_quality);
goto done;
}
// Fall back: rewind to last _SET_IP and replace with _DEOPT.
_PyUOpInstruction *curr = uop_buffer_last(trace);
while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) {
trace->next--;
curr = uop_buffer_last(trace);
}
assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2);
if (curr->opcode == _SET_IP) {
int32_t old_target = (int32_t)uop_get_target(curr);
curr->opcode = _DEOPT;
Expand All @@ -763,6 +853,39 @@ _PyJit_translate_single_bytecode_to_trace(
return 1;
}

// Fitness-based trace quality check (before reserving space for this instruction)
_PyJitTracerTranslatorState *ts = &tracer->translator_state;
int32_t eq = compute_exit_quality(target_instr, opcode,
&tstate->interp->opt_config);

// Record best exit candidate.
// Only record after minimum progress to avoid truncating to near-empty traces.
if (eq > ts->best_exit_quality &&
uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) {
ts->best_exit_quality = eq;
ts->best_exit_buffer_pos = uop_buffer_length(trace);
ts->best_exit_target = target;
}

// Check if fitness is depleted — should we stop the trace?
if (ts->fitness < eq &&
!(progress_needed && uop_buffer_length(trace) < CODE_SIZE_NO_PROGRESS)) {
Comment thread
cocolato marked this conversation as resolved.
Outdated
// Prefer stopping at the best recorded exit point
Comment thread
cocolato marked this conversation as resolved.
Outdated
if (try_best_exit_fallback(trace, ts, progress_needed)) {
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts->best_exit_target);
uop_buffer_last(trace)->operand1 = true; // is_control_flow
}
else {
// No valid best exit — stop at current position
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
uop_buffer_last(trace)->operand1 = true; // is_control_flow
Comment thread
cocolato marked this conversation as resolved.
Outdated
}
OPT_STAT_INC(fitness_terminated_traces);
DPRINTF(2, "Fitness terminated: fitness=%d < exit_quality=%d\n",
ts->fitness, eq);
goto done;
}

// One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
trace->end -= 2;

Expand Down Expand Up @@ -793,6 +916,12 @@ _PyJit_translate_single_bytecode_to_trace(
DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n",
space_needed, uop_buffer_remaining_space(trace));
OPT_STAT_INC(trace_too_long);
// Try best-exit fallback before giving up
if (try_best_exit_fallback(trace, &tracer->translator_state, progress_needed)) {
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, tracer->translator_state.best_exit_target);
uop_buffer_last(trace)->operand1 = true; // is_control_flow
OPT_STAT_INC(best_exit_fallback);
}
goto done;
}

Expand Down Expand Up @@ -945,6 +1074,28 @@ _PyJit_translate_single_bytecode_to_trace(
assert(next->op.code == STORE_FAST);
operand = next->op.arg;
}
else if (uop == _PUSH_FRAME) {
_PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
ts_depth->frame_depth++;
if (ts_depth->frame_depth >= MAX_ABSTRACT_FRAME_DEPTH) {
// The optimizer can't handle frames this deep,
// so there's no point continuing the trace.
DPRINTF(2, "Unsupported: frame depth %d >= MAX_ABSTRACT_FRAME_DEPTH\n",
ts_depth->frame_depth);
goto unsupported;
Copy link
Copy Markdown
Member

@Fidget-Spinner Fidget-Spinner Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should goto somewhere that we rewind and insert _EXIT_TRACE. The current unsupported instead inserts _DEOPT.

Perhaps try assigning int end_trace_opcode = _DEOPT, and when it hits this branch, write end_trace_opcode = _EXIT_TRACE then assign curr->opcode = end_trace_opcode;.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This shouldn't happen at all. An assert should be all that is needed, if we have chosen the fitness parameters correctly.

}
int32_t penalty = (int32_t)tstate->interp->opt_config.fitness_frame_entry
* ts_depth->frame_depth;
ts_depth->fitness -= penalty;
}
else if (uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) {
_PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
if (ts_depth->frame_depth <= 0) {
// Underflow
ts_depth->fitness -= (int32_t)tstate->interp->opt_config.fitness_frame_entry * 2;
Comment thread
cocolato marked this conversation as resolved.
Outdated
}
ts_depth->frame_depth = ts_depth->frame_depth <= 0 ? 0 : ts_depth->frame_depth - 1;
Comment thread
cocolato marked this conversation as resolved.
}
else if (_PyUop_Flags[uop] & HAS_RECORDS_VALUE_FLAG) {
PyObject *recorded_value = tracer->prev_state.recorded_value;
tracer->prev_state.recorded_value = NULL;
Expand Down Expand Up @@ -986,7 +1137,12 @@ _PyJit_translate_single_bytecode_to_trace(
ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
goto done;
}
DPRINTF(2, "Trace continuing\n");
// Update fitness AFTER translation, BEFORE returning to continue tracing.
// This ensures the next iteration's fitness check reflects the cost of
// all instructions translated so far.
update_trace_fitness(&tracer->translator_state, opcode, target_instr,
&tstate->interp->opt_config);
DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness);
return 1;
done:
DPRINTF(2, "Trace done\n");
Expand Down Expand Up @@ -1069,6 +1225,18 @@ _PyJit_TryInitializeTracing(
assert(curr_instr->op.code == JUMP_BACKWARD_JIT || curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL));
tracer->initial_state.jump_backward_instr = curr_instr;

// Initialize fitness tracking state
const _PyOptimizationConfig *cfg = &tstate->interp->opt_config;
_PyJitTracerTranslatorState *ts = &tracer->translator_state;
bool is_side_trace = (exit != NULL);
ts->fitness = is_side_trace
? (int32_t)cfg->fitness_initial_side
: (int32_t)cfg->fitness_initial;
ts->best_exit_quality = 0;
ts->best_exit_buffer_pos = -1;
ts->best_exit_target = 0;
ts->frame_depth = 0;

tracer->is_tracing = true;
return 1;
}
Expand Down
34 changes: 34 additions & 0 deletions Python/pystate.c
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,40 @@ init_interpreter(PyInterpreterState *interp,
"PYTHON_JIT_SIDE_EXIT_INITIAL_BACKOFF",
SIDE_EXIT_INITIAL_BACKOFF, 0, MAX_BACKOFF);

// Trace fitness configuration
init_policy(&interp->opt_config.fitness_initial,
"PYTHON_JIT_FITNESS_INITIAL",
FITNESS_INITIAL, 100, 10000);
init_policy(&interp->opt_config.fitness_initial_side,
"PYTHON_JIT_FITNESS_INITIAL_SIDE",
FITNESS_INITIAL_SIDE, 50, 5000);
init_policy(&interp->opt_config.fitness_per_instruction,
"PYTHON_JIT_FITNESS_PER_INSTRUCTION",
FITNESS_PER_INSTRUCTION, 0, 100);
init_policy(&interp->opt_config.fitness_branch_biased,
"PYTHON_JIT_FITNESS_BRANCH_BIASED",
FITNESS_BRANCH_BIASED, 0, 500);
init_policy(&interp->opt_config.fitness_branch_unbiased,
"PYTHON_JIT_FITNESS_BRANCH_UNBIASED",
FITNESS_BRANCH_UNBIASED, 0, 500);
init_policy(&interp->opt_config.fitness_backward_edge,
"PYTHON_JIT_FITNESS_BACKWARD_EDGE",
FITNESS_BACKWARD_EDGE, 0, 1000);
init_policy(&interp->opt_config.fitness_frame_entry,
"PYTHON_JIT_FITNESS_FRAME_ENTRY",
FITNESS_FRAME_ENTRY, 0, 1000);

// Exit quality thresholds
init_policy(&interp->opt_config.exit_quality_enter_executor,
"PYTHON_JIT_EXIT_QUALITY_ENTER_EXECUTOR",
EXIT_QUALITY_ENTER_EXECUTOR, 0, 10000);
init_policy(&interp->opt_config.exit_quality_default,
"PYTHON_JIT_EXIT_QUALITY_DEFAULT",
EXIT_QUALITY_DEFAULT, 0, 10000);
init_policy(&interp->opt_config.exit_quality_specializable,
"PYTHON_JIT_EXIT_QUALITY_SPECIALIZABLE",
EXIT_QUALITY_SPECIALIZABLE, 0, 10000);

interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF");
interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE");
if (interp != &runtime->_main_interpreter) {
Expand Down
2 changes: 2 additions & 0 deletions Python/pystats.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,8 @@ print_optimization_stats(FILE *out, OptimizationStats *stats)
fprintf(out, "Optimization low confidence: %" PRIu64 "\n", stats->low_confidence);
fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee);
fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated);
fprintf(out, "Optimization fitness terminated: %" PRIu64 "\n", stats->fitness_terminated_traces);
fprintf(out, "Optimization best exit fallback: %" PRIu64 "\n", stats->best_exit_fallback);

print_histogram(out, "Trace length", stats->trace_length_hist);
print_histogram(out, "Trace run length", stats->trace_run_length_hist);
Expand Down
Loading