-
-
Notifications
You must be signed in to change notification settings - Fork 34.5k
gh-146073: Add fitness/exit quality mechanism for JIT trace frontend #147966
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
1bfa176
2f9438a
709c0a1
ef6ac24
21f7122
b99fe61
d09afb5
c9957c3
9447546
7d3e4c4
2c1b5e0
2409b2f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,23 @@ extern "C" { | |
| #include "pycore_optimizer_types.h" | ||
| #include <stdbool.h> | ||
|
|
||
| /* Default fitness configuration values for trace quality control. | ||
| * These can be overridden via PYTHON_JIT_FITNESS_* environment variables. */ | ||
| #define FITNESS_PER_INSTRUCTION 2 | ||
| #define FITNESS_INITIAL (UOP_MAX_TRACE_LENGTH * FITNESS_PER_INSTRUCTION) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd stick with 2000. We really don't need traces of over 1000 uops; it is far more important to find a good stopping point than to have very long traces
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @markshannon 1000 uops is only roughly 300 uops after optimization which corresponds to roughly only 60 bytecode instructions. That's too low. A reminder that the trace recording spews out a lot of uops that is only optimized away by the optimizer.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cocolato I suspect richards perf regression might be due to too low trace length, I experimented with richards before and it's quite sensitive to how well the optimizer performs
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, and I tried reducing the branch/frame penalty, which also improve richards' performance. |
||
| #define FITNESS_INITIAL_SIDE 800 | ||
| #define FITNESS_BRANCH_BIASED 5 | ||
| #define FITNESS_BRANCH_UNBIASED 25 | ||
| #define FITNESS_BACKWARD_EDGE 80 | ||
| #define FITNESS_FRAME_ENTRY 10 | ||
|
|
||
| /* Default exit quality constants for fitness-based trace termination. | ||
| * Higher values mean better places to stop the trace. | ||
| * These can be overridden via PYTHON_JIT_EXIT_QUALITY_* environment variables. */ | ||
| #define EXIT_QUALITY_ENTER_EXECUTOR 500 | ||
| #define EXIT_QUALITY_DEFAULT 200 | ||
| #define EXIT_QUALITY_SPECIALIZABLE 50 | ||
|
|
||
|
|
||
| typedef struct _PyJitUopBuffer { | ||
| _PyUOpInstruction *start; | ||
|
|
@@ -101,7 +118,11 @@ typedef struct _PyJitTracerPreviousState { | |
| } _PyJitTracerPreviousState; | ||
|
|
||
| typedef struct _PyJitTracerTranslatorState { | ||
| int jump_backward_seen; | ||
| int32_t fitness; // Current trace fitness, starts high, decrements | ||
| int32_t best_exit_quality; // Best exit quality seen so far | ||
| int best_exit_buffer_pos; // Position in code_buffer of best exit (-1=none) | ||
| uint32_t best_exit_target; // Bytecode target of best exit point | ||
| int frame_depth; // Current inline depth (0 = root frame) | ||
| } _PyJitTracerTranslatorState; | ||
|
|
||
| typedef struct _PyJitTracerState { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -549,8 +549,6 @@ dynamic_exit_uop[MAX_UOP_ID + 1] = { | |
| }; | ||
|
|
||
|
|
||
| #define CONFIDENCE_RANGE 1000 | ||
| #define CONFIDENCE_CUTOFF 333 | ||
|
|
||
| #ifdef Py_DEBUG | ||
| #define DPRINTF(level, ...) \ | ||
|
|
@@ -598,6 +596,86 @@ add_to_trace( | |
| ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive))) | ||
|
|
||
|
|
||
| /* Compute branch bias from the 16-bit branch history register. | ||
| * Returns 0 (completely unpredictable, 50/50) to 8 (fully biased). */ | ||
| static inline int | ||
|
cocolato marked this conversation as resolved.
|
||
| compute_branch_bias(uint16_t history) | ||
| { | ||
| int ones = _Py_popcount32((uint32_t)history); | ||
| return abs(ones - 8); | ||
| } | ||
|
|
||
| /* Compute exit quality for the current trace position. | ||
| * Higher values mean it's a better place to stop the trace. */ | ||
| static inline int32_t | ||
| compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode, | ||
|
cocolato marked this conversation as resolved.
|
||
| const _PyOptimizationConfig *cfg) | ||
| { | ||
| if (target_instr->op.code == ENTER_EXECUTOR) { | ||
| return (int32_t)cfg->exit_quality_enter_executor; | ||
| } | ||
| if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) { | ||
| return (int32_t)cfg->exit_quality_specializable; | ||
| } | ||
| return (int32_t)cfg->exit_quality_default; | ||
| } | ||
|
|
||
| /* Try to truncate the trace to the best recorded exit point. | ||
| * Returns 1 if successful, 0 if no valid best exit exists. | ||
| * Enforces progress constraints: the fallback position must satisfy | ||
| * the minimum trace length requirements. */ | ||
| static inline int | ||
|
cocolato marked this conversation as resolved.
Outdated
|
||
| try_best_exit_fallback( | ||
| _PyJitUopBuffer *trace, | ||
| _PyJitTracerTranslatorState *ts, | ||
| bool progress_needed) | ||
| { | ||
| int best_pos = ts->best_exit_buffer_pos; | ||
| if (best_pos <= 0) { | ||
| return 0; | ||
| } else if (progress_needed && best_pos <= CODE_SIZE_NO_PROGRESS) { | ||
| return 0; | ||
| } else if (!progress_needed && best_pos <= CODE_SIZE_EMPTY) { | ||
| return 0; | ||
| } | ||
| trace->next = trace->start + best_pos; | ||
| /* Caller must add terminator (_EXIT_TRACE) after this */ | ||
| return 1; | ||
| } | ||
|
|
||
| /* Update trace fitness after translating one bytecode instruction. */ | ||
| static inline void | ||
| update_trace_fitness( | ||
|
cocolato marked this conversation as resolved.
Outdated
|
||
| _PyJitTracerTranslatorState *ts, | ||
| int opcode, | ||
| _Py_CODEUNIT *target_instr, | ||
| const _PyOptimizationConfig *cfg) | ||
| { | ||
| ts->fitness -= cfg->fitness_per_instruction; | ||
|
|
||
| switch (opcode) { | ||
| case POP_JUMP_IF_FALSE: | ||
| case POP_JUMP_IF_TRUE: | ||
| case POP_JUMP_IF_NONE: | ||
| case POP_JUMP_IF_NOT_NONE: { | ||
| int bias = compute_branch_bias(target_instr[1].cache); | ||
| /* Linear interpolation: bias 0 → unbiased penalty, bias 8 → biased penalty */ | ||
| int penalty = cfg->fitness_branch_unbiased | ||
| - (bias * (cfg->fitness_branch_unbiased - cfg->fitness_branch_biased)) / 8; | ||
| ts->fitness -= penalty; | ||
| break; | ||
| } | ||
| case JUMP_BACKWARD: | ||
| case JUMP_BACKWARD_JIT: | ||
| case JUMP_BACKWARD_NO_JIT: | ||
| ts->fitness -= cfg->fitness_backward_edge; | ||
| break; | ||
| default: | ||
| break; | ||
| } | ||
| } | ||
|
Comment on lines
+633
to
+637
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The newest result with lower frame penalty: |
||
|
|
||
|
|
||
| static int | ||
| is_terminator(const _PyUOpInstruction *uop) | ||
| { | ||
|
|
@@ -734,13 +812,25 @@ _PyJit_translate_single_bytecode_to_trace( | |
| DPRINTF(2, "Unsupported: oparg too large\n"); | ||
| unsupported: | ||
| { | ||
| // Rewind to previous instruction and replace with _EXIT_TRACE. | ||
| // If we have a high-quality best_exit (enter_executor, etc.), | ||
|
cocolato marked this conversation as resolved.
Outdated
|
||
| // prefer it over rewinding to last _SET_IP — this covers the | ||
| // main unsupported path, not just the edge case. | ||
| _PyJitTracerTranslatorState *ts_unsup = &tracer->translator_state; | ||
| if (ts_unsup->best_exit_quality > (int32_t)tstate->interp->opt_config.exit_quality_default && | ||
| try_best_exit_fallback(trace, ts_unsup, progress_needed)) { | ||
| ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts_unsup->best_exit_target); | ||
| uop_buffer_last(trace)->operand1 = true; // is_control_flow | ||
| OPT_STAT_INC(best_exit_fallback); | ||
| DPRINTF(2, "Best-exit fallback at unsupported (pos=%d, quality=%d)\n", | ||
| ts_unsup->best_exit_buffer_pos, ts_unsup->best_exit_quality); | ||
| goto done; | ||
| } | ||
| // Fall back: rewind to last _SET_IP and replace with _DEOPT. | ||
| _PyUOpInstruction *curr = uop_buffer_last(trace); | ||
| while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) { | ||
| trace->next--; | ||
| curr = uop_buffer_last(trace); | ||
| } | ||
| assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2); | ||
| if (curr->opcode == _SET_IP) { | ||
| int32_t old_target = (int32_t)uop_get_target(curr); | ||
| curr->opcode = _DEOPT; | ||
|
|
@@ -763,6 +853,39 @@ _PyJit_translate_single_bytecode_to_trace( | |
| return 1; | ||
| } | ||
|
|
||
| // Fitness-based trace quality check (before reserving space for this instruction) | ||
| _PyJitTracerTranslatorState *ts = &tracer->translator_state; | ||
| int32_t eq = compute_exit_quality(target_instr, opcode, | ||
| &tstate->interp->opt_config); | ||
|
|
||
| // Record best exit candidate. | ||
| // Only record after minimum progress to avoid truncating to near-empty traces. | ||
| if (eq > ts->best_exit_quality && | ||
| uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) { | ||
| ts->best_exit_quality = eq; | ||
| ts->best_exit_buffer_pos = uop_buffer_length(trace); | ||
| ts->best_exit_target = target; | ||
| } | ||
|
|
||
| // Check if fitness is depleted — should we stop the trace? | ||
| if (ts->fitness < eq && | ||
| !(progress_needed && uop_buffer_length(trace) < CODE_SIZE_NO_PROGRESS)) { | ||
|
cocolato marked this conversation as resolved.
Outdated
|
||
| // Prefer stopping at the best recorded exit point | ||
|
cocolato marked this conversation as resolved.
Outdated
|
||
| if (try_best_exit_fallback(trace, ts, progress_needed)) { | ||
| ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts->best_exit_target); | ||
| uop_buffer_last(trace)->operand1 = true; // is_control_flow | ||
| } | ||
| else { | ||
| // No valid best exit — stop at current position | ||
| ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); | ||
| uop_buffer_last(trace)->operand1 = true; // is_control_flow | ||
|
cocolato marked this conversation as resolved.
Outdated
|
||
| } | ||
| OPT_STAT_INC(fitness_terminated_traces); | ||
| DPRINTF(2, "Fitness terminated: fitness=%d < exit_quality=%d\n", | ||
| ts->fitness, eq); | ||
| goto done; | ||
| } | ||
|
|
||
| // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT | ||
| trace->end -= 2; | ||
|
|
||
|
|
@@ -793,6 +916,12 @@ _PyJit_translate_single_bytecode_to_trace( | |
| DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n", | ||
| space_needed, uop_buffer_remaining_space(trace)); | ||
| OPT_STAT_INC(trace_too_long); | ||
| // Try best-exit fallback before giving up | ||
| if (try_best_exit_fallback(trace, &tracer->translator_state, progress_needed)) { | ||
| ADD_TO_TRACE(_EXIT_TRACE, 0, 0, tracer->translator_state.best_exit_target); | ||
| uop_buffer_last(trace)->operand1 = true; // is_control_flow | ||
| OPT_STAT_INC(best_exit_fallback); | ||
| } | ||
| goto done; | ||
| } | ||
|
|
||
|
|
@@ -945,6 +1074,28 @@ _PyJit_translate_single_bytecode_to_trace( | |
| assert(next->op.code == STORE_FAST); | ||
| operand = next->op.arg; | ||
| } | ||
| else if (uop == _PUSH_FRAME) { | ||
| _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state; | ||
| ts_depth->frame_depth++; | ||
| if (ts_depth->frame_depth >= MAX_ABSTRACT_FRAME_DEPTH) { | ||
| // The optimizer can't handle frames this deep, | ||
| // so there's no point continuing the trace. | ||
| DPRINTF(2, "Unsupported: frame depth %d >= MAX_ABSTRACT_FRAME_DEPTH\n", | ||
| ts_depth->frame_depth); | ||
| goto unsupported; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should goto somewhere that we rewind and insert Perhaps try assigning
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This shouldn't happen at all. An assert should be all that is needed, if we have chosen the fitness parameters correctly. |
||
| } | ||
| int32_t penalty = (int32_t)tstate->interp->opt_config.fitness_frame_entry | ||
| * ts_depth->frame_depth; | ||
| ts_depth->fitness -= penalty; | ||
| } | ||
| else if (uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) { | ||
| _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state; | ||
| if (ts_depth->frame_depth <= 0) { | ||
| // Underflow | ||
| ts_depth->fitness -= (int32_t)tstate->interp->opt_config.fitness_frame_entry * 2; | ||
|
cocolato marked this conversation as resolved.
Outdated
|
||
| } | ||
| ts_depth->frame_depth = ts_depth->frame_depth <= 0 ? 0 : ts_depth->frame_depth - 1; | ||
|
cocolato marked this conversation as resolved.
|
||
| } | ||
| else if (_PyUop_Flags[uop] & HAS_RECORDS_VALUE_FLAG) { | ||
| PyObject *recorded_value = tracer->prev_state.recorded_value; | ||
| tracer->prev_state.recorded_value = NULL; | ||
|
|
@@ -986,7 +1137,12 @@ _PyJit_translate_single_bytecode_to_trace( | |
| ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0); | ||
| goto done; | ||
| } | ||
| DPRINTF(2, "Trace continuing\n"); | ||
| // Update fitness AFTER translation, BEFORE returning to continue tracing. | ||
| // This ensures the next iteration's fitness check reflects the cost of | ||
| // all instructions translated so far. | ||
| update_trace_fitness(&tracer->translator_state, opcode, target_instr, | ||
| &tstate->interp->opt_config); | ||
| DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness); | ||
| return 1; | ||
| done: | ||
| DPRINTF(2, "Trace done\n"); | ||
|
|
@@ -1069,6 +1225,18 @@ _PyJit_TryInitializeTracing( | |
| assert(curr_instr->op.code == JUMP_BACKWARD_JIT || curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL)); | ||
| tracer->initial_state.jump_backward_instr = curr_instr; | ||
|
|
||
| // Initialize fitness tracking state | ||
| const _PyOptimizationConfig *cfg = &tstate->interp->opt_config; | ||
| _PyJitTracerTranslatorState *ts = &tracer->translator_state; | ||
| bool is_side_trace = (exit != NULL); | ||
| ts->fitness = is_side_trace | ||
| ? (int32_t)cfg->fitness_initial_side | ||
| : (int32_t)cfg->fitness_initial; | ||
| ts->best_exit_quality = 0; | ||
| ts->best_exit_buffer_pos = -1; | ||
| ts->best_exit_target = 0; | ||
| ts->frame_depth = 0; | ||
|
|
||
| tracer->is_tracing = true; | ||
| return 1; | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.