Skip to content

Commit 76c5852

Browse files
wochinge and claude committed
feat(experiment): tighten RunnerContext + RegressionError public surface
- RunnerContext no longer carries `name` or `run_name` as context-level defaults. `name` is now required on every `run_experiment` call (supports the action's directory-of-experiments mode where each script must name itself). `run_name` passes straight through to `Langfuse.run_experiment`.
- RegressionError gains three typed `@overload` signatures (minimal, free-form message, structured metric/value/threshold) so type checkers enforce that `metric` and `value` are supplied together. At runtime, partial structured input falls back to the default message instead of rendering misleading `None` placeholders in PR comments.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ef1aa71 commit 76c5852

2 files changed

Lines changed: 76 additions & 61 deletions

File tree

langfuse/experiment.py

Lines changed: 30 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
Protocol,
1818
TypedDict,
1919
Union,
20+
overload,
2021
)
2122

2223
from langfuse.api import DatasetItem
@@ -1063,10 +1064,9 @@ class RunnerContext:
10631064
Intended for use with the ``langfuse/experiment-action`` GitHub Action
10641065
(https://github.com/langfuse/experiment-action). The action builds a
10651066
``RunnerContext`` before invoking the user's ``experiment(context)``
1066-
function. Defaults set here (dataset, name, run name, metadata tags) are
1067-
applied when the user omits them on the :meth:`run_experiment` call;
1068-
users can override any default by passing the corresponding argument
1069-
explicitly.
1067+
function. Defaults set here (dataset, metadata tags) are applied when
1068+
the user omits them on the :meth:`run_experiment` call; users can
1069+
override any default by passing the corresponding argument explicitly.
10701070
"""
10711071

10721072
def __init__(
@@ -1075,8 +1075,6 @@ def __init__(
10751075
client: "Langfuse",
10761076
data: Optional[ExperimentData] = None,
10771077
dataset_version: Optional[datetime] = None,
1078-
name: Optional[str] = None,
1079-
run_name: Optional[str] = None,
10801078
metadata: Optional[Dict[str, str]] = None,
10811079
):
10821080
"""Build a ``RunnerContext`` populated with defaults for ``run_experiment``.
@@ -1098,12 +1096,6 @@ def __init__(
10981096
:meth:`run_experiment`.
10991097
dataset_version: Optional pinned dataset version. Injected by the
11001098
action when ``dataset_version`` is configured.
1101-
name: Default human-readable experiment name (e.g. the action's
1102-
``experiment_name`` input). If ``None``, the user must pass
1103-
``name=`` to :meth:`run_experiment`.
1104-
run_name: Default exact run name. The action typically derives
1105-
this from the commit SHA / PR number so that reruns produce
1106-
distinct runs in Langfuse.
11071099
metadata: Default metadata attached to every experiment trace and
11081100
the dataset run. The action injects GitHub-sourced tags (SHA,
11091101
PR link, workflow run link, branch, GH user, etc.). Merged
@@ -1113,14 +1105,12 @@ def __init__(
11131105
self.client = client
11141106
self.data = data
11151107
self.dataset_version = dataset_version
1116-
self.name = name
1117-
self.run_name = run_name
11181108
self.metadata = metadata
11191109

11201110
def run_experiment(
11211111
self,
11221112
*,
1123-
name: Optional[str] = None,
1113+
name: str,
11241114
run_name: Optional[str] = None,
11251115
description: Optional[str] = None,
11261116
data: Optional[ExperimentData] = None,
@@ -1132,19 +1122,12 @@ def run_experiment(
11321122
metadata: Optional[Dict[str, str]] = None,
11331123
_dataset_version: Optional[datetime] = None,
11341124
) -> ExperimentResult:
1135-
resolved_name = name if name is not None else self.name
1136-
if resolved_name is None:
1137-
raise ValueError(
1138-
"`name` must be provided either on the RunnerContext or the run_experiment call"
1139-
)
1140-
11411125
resolved_data = data if data is not None else self.data
11421126
if resolved_data is None:
11431127
raise ValueError(
11441128
"`data` must be provided either on the RunnerContext or the run_experiment call"
11451129
)
11461130

1147-
resolved_run_name = run_name if run_name is not None else self.run_name
11481131
resolved_dataset_version = (
11491132
_dataset_version if _dataset_version is not None else self.dataset_version
11501133
)
@@ -1156,8 +1139,8 @@ def run_experiment(
11561139
merged_metadata = {**(self.metadata or {}), **(metadata or {})}
11571140

11581141
return self.client.run_experiment(
1159-
name=resolved_name,
1160-
run_name=resolved_run_name,
1142+
name=name,
1143+
run_name=run_name,
11611144
description=description,
11621145
data=resolved_data,
11631146
task=task,
@@ -1178,8 +1161,30 @@ class RegressionError(Exception):
11781161
exception and, when ``should_fail_on_error`` is enabled, fails the
11791162
workflow run and renders a callout in the PR comment using
11801163
``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.
1164+
1165+
Callers choose one of three forms:
1166+
1167+
- ``RegressionError(result=r)`` — minimal, generic message.
1168+
- ``RegressionError(result=r, message="...")`` — free-form message.
1169+
- ``RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)`` —
1170+
structured; ``metric`` and ``value`` must be provided together so the
1171+
action can render a targeted callout without ``None`` placeholders.
11811172
"""
11821173

1174+
@overload
1175+
def __init__(self, *, result: ExperimentResult) -> None: ...
1176+
@overload
1177+
def __init__(self, *, result: ExperimentResult, message: str) -> None: ...
1178+
@overload
1179+
def __init__(
1180+
self,
1181+
*,
1182+
result: ExperimentResult,
1183+
metric: str,
1184+
value: float,
1185+
threshold: Optional[float] = None,
1186+
message: Optional[str] = None,
1187+
) -> None: ...
11831188
def __init__(
11841189
self,
11851190
*,
@@ -1195,7 +1200,7 @@ def __init__(
11951200
self.threshold = threshold
11961201
if message is not None:
11971202
formatted = message
1198-
elif metric is not None:
1203+
elif metric is not None and value is not None:
11991204
formatted = f"Regression on `{metric}`: {value} (threshold {threshold})"
12001205
else:
12011206
formatted = "Experiment regression detected"

tests/unit/test_experiment.py

Lines changed: 46 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -30,18 +30,15 @@ def test_context_defaults_flow_through(self):
3030
ctx = _make_ctx(
3131
data=ctx_data,
3232
dataset_version=ctx_version,
33-
name="ctx-name",
34-
run_name="ctx-run",
3533
metadata={"sha": "abc123"},
3634
)
3735

38-
result = ctx.run_experiment(task=_noop_task)
36+
result = ctx.run_experiment(name="exp", task=_noop_task)
3937

4038
assert result == "result-sentinel"
4139
ctx.client.run_experiment.assert_called_once()
4240
kwargs = ctx.client.run_experiment.call_args.kwargs
43-
assert kwargs["name"] == "ctx-name"
44-
assert kwargs["run_name"] == "ctx-run"
41+
assert kwargs["name"] == "exp"
4542
assert kwargs["data"] is ctx_data
4643
assert kwargs["metadata"] == {"sha": "abc123"}
4744
assert kwargs["_dataset_version"] == ctx_version
@@ -51,22 +48,20 @@ def test_call_overrides_win(self):
5148
ctx = _make_ctx(
5249
data=[{"input": "ctx"}],
5350
dataset_version=datetime(2026, 1, 1),
54-
name="ctx-name",
55-
run_name="ctx-run",
5651
)
5752

5853
override_data = [{"input": "override"}]
5954
override_version = datetime(2026, 6, 6)
6055
ctx.run_experiment(
56+
name="exp",
6157
task=_noop_task,
62-
name="call-name",
6358
run_name="call-run",
6459
data=override_data,
6560
_dataset_version=override_version,
6661
)
6762

6863
kwargs = ctx.client.run_experiment.call_args.kwargs
69-
assert kwargs["name"] == "call-name"
64+
assert kwargs["name"] == "exp"
7065
assert kwargs["run_name"] == "call-run"
7166
assert kwargs["data"] is override_data
7267
assert kwargs["_dataset_version"] == override_version
@@ -76,58 +71,52 @@ class TestRunnerContextMetadataMerge:
7671
def test_user_keys_win_on_collision(self):
7772
ctx = _make_ctx(
7873
data=[{"input": "a"}],
79-
name="n",
8074
metadata={"sha": "abc", "branch": "main"},
8175
)
82-
ctx.run_experiment(task=_noop_task, metadata={"sha": "def", "pr": "42"})
76+
ctx.run_experiment(
77+
name="exp", task=_noop_task, metadata={"sha": "def", "pr": "42"}
78+
)
8379
assert ctx.client.run_experiment.call_args.kwargs["metadata"] == {
8480
"sha": "def",
8581
"branch": "main",
8682
"pr": "42",
8783
}
8884

8985
def test_context_metadata_only(self):
90-
ctx = _make_ctx(
91-
data=[{"input": "a"}], name="n", metadata={"sha": "abc"}
92-
)
93-
ctx.run_experiment(task=_noop_task)
86+
ctx = _make_ctx(data=[{"input": "a"}], metadata={"sha": "abc"})
87+
ctx.run_experiment(name="exp", task=_noop_task)
9488
assert ctx.client.run_experiment.call_args.kwargs["metadata"] == {"sha": "abc"}
9589

9690
def test_call_metadata_only(self):
97-
ctx = _make_ctx(data=[{"input": "a"}], name="n")
98-
ctx.run_experiment(task=_noop_task, metadata={"pr": "1"})
91+
ctx = _make_ctx(data=[{"input": "a"}])
92+
ctx.run_experiment(name="exp", task=_noop_task, metadata={"pr": "1"})
9993
assert ctx.client.run_experiment.call_args.kwargs["metadata"] == {"pr": "1"}
10094

10195
def test_both_none_stays_none(self):
102-
ctx = _make_ctx(data=[{"input": "a"}], name="n")
103-
ctx.run_experiment(task=_noop_task)
96+
ctx = _make_ctx(data=[{"input": "a"}])
97+
ctx.run_experiment(name="exp", task=_noop_task)
10498
assert ctx.client.run_experiment.call_args.kwargs["metadata"] is None
10599

106100

107101
class TestRunnerContextLocalItems:
108102
def test_local_items_pass_through_as_context_default(self):
109103
items = [{"input": "x", "expected_output": "y"}]
110-
ctx = _make_ctx(data=items, name="n")
111-
ctx.run_experiment(task=_noop_task)
104+
ctx = _make_ctx(data=items)
105+
ctx.run_experiment(name="exp", task=_noop_task)
112106
assert ctx.client.run_experiment.call_args.kwargs["data"] is items
113107

114108
def test_local_items_pass_through_as_call_override(self):
115-
ctx = _make_ctx(name="n")
109+
ctx = _make_ctx()
116110
items = [{"input": "x"}]
117-
ctx.run_experiment(task=_noop_task, data=items)
111+
ctx.run_experiment(name="exp", task=_noop_task, data=items)
118112
assert ctx.client.run_experiment.call_args.kwargs["data"] is items
119113

120114

121115
class TestRunnerContextValidation:
122-
def test_missing_name_raises(self):
123-
ctx = _make_ctx(data=[{"input": "a"}])
124-
with pytest.raises(ValueError, match="name"):
125-
ctx.run_experiment(task=_noop_task)
126-
127116
def test_missing_data_raises(self):
128-
ctx = _make_ctx(name="n")
117+
ctx = _make_ctx()
129118
with pytest.raises(ValueError, match="data"):
130-
ctx.run_experiment(task=_noop_task)
119+
ctx.run_experiment(name="exp", task=_noop_task)
131120

132121

133122
class TestRegressionError:
@@ -155,7 +144,14 @@ def test_structured_message(self):
155144
assert "0.78" in str(exc)
156145
assert "0.9" in str(exc)
157146

158-
def test_user_message_wins(self):
147+
def test_free_form_message(self):
148+
exc = RegressionError(
149+
result=MagicMock(),
150+
message="custom explanation",
151+
)
152+
assert str(exc) == "custom explanation"
153+
154+
def test_message_wins_over_structured(self):
159155
exc = RegressionError(
160156
result=MagicMock(),
161157
metric="avg_accuracy",
@@ -164,19 +160,33 @@ def test_user_message_wins(self):
164160
message="custom explanation",
165161
)
166162
assert str(exc) == "custom explanation"
163+
assert exc.metric == "avg_accuracy"
164+
assert exc.value == 0.5
165+
assert exc.threshold == 0.9
166+
167+
def test_partial_structured_falls_back_to_default(self):
168+
"""The structured overload requires ``metric`` and ``value`` together.
169+
170+
If a caller bypasses the type checker and passes only one, we fall
171+
back to the default message rather than rendering misleading
172+
``None`` placeholders in the PR comment.
173+
"""
174+
exc = RegressionError(result=MagicMock(), metric="avg_accuracy") # type: ignore[call-overload]
175+
assert str(exc) == "Experiment regression detected"
167176

168177

169178
class TestSignatureDriftGuard:
170179
"""Fails loudly if ``Langfuse.run_experiment`` grows a parameter that is
171180
not threaded through ``RunnerContext.run_experiment``.
172181
173-
The four action-relaxed params (``name``, ``run_name``, ``data``,
174-
``_dataset_version``) are allowed to diverge: the RunnerContext variant
175-
must be the ``Optional[...]`` of the client annotation so the action can
176-
inject them.
182+
``data`` is the only genuinely relaxed parameter: it is required on the
183+
client but optional on the RunnerContext so the action can inject it.
184+
``run_name`` and ``_dataset_version`` are already ``Optional`` on the
185+
client and must match as-is. ``name`` is required on both — the action
186+
supports a directory of experiments, so each script must name itself.
177187
"""
178188

179-
RELAXED_PARAMS = {"name", "run_name", "data", "_dataset_version"}
189+
RELAXED_PARAMS = {"data"}
180190

181191
# `CompositeEvaluatorFunction` is only imported under TYPE_CHECKING in
182192
# ``langfuse.experiment`` to break the circular dependency with

0 commit comments

Comments
 (0)