@@ -30,18 +30,15 @@ def test_context_defaults_flow_through(self):
3030 ctx = _make_ctx (
3131 data = ctx_data ,
3232 dataset_version = ctx_version ,
33- name = "ctx-name" ,
34- run_name = "ctx-run" ,
3533 metadata = {"sha" : "abc123" },
3634 )
3735
38- result = ctx .run_experiment (task = _noop_task )
36+ result = ctx .run_experiment (name = "exp" , task = _noop_task )
3937
4038 assert result == "result-sentinel"
4139 ctx .client .run_experiment .assert_called_once ()
4240 kwargs = ctx .client .run_experiment .call_args .kwargs
43- assert kwargs ["name" ] == "ctx-name"
44- assert kwargs ["run_name" ] == "ctx-run"
41+ assert kwargs ["name" ] == "exp"
4542 assert kwargs ["data" ] is ctx_data
4643 assert kwargs ["metadata" ] == {"sha" : "abc123" }
4744 assert kwargs ["_dataset_version" ] == ctx_version
@@ -51,22 +48,20 @@ def test_call_overrides_win(self):
5148 ctx = _make_ctx (
5249 data = [{"input" : "ctx" }],
5350 dataset_version = datetime (2026 , 1 , 1 ),
54- name = "ctx-name" ,
55- run_name = "ctx-run" ,
5651 )
5752
5853 override_data = [{"input" : "override" }]
5954 override_version = datetime (2026 , 6 , 6 )
6055 ctx .run_experiment (
56+ name = "exp" ,
6157 task = _noop_task ,
62- name = "call-name" ,
6358 run_name = "call-run" ,
6459 data = override_data ,
6560 _dataset_version = override_version ,
6661 )
6762
6863 kwargs = ctx .client .run_experiment .call_args .kwargs
69- assert kwargs ["name" ] == "call-name"
64+ assert kwargs ["name" ] == "exp"
7065 assert kwargs ["run_name" ] == "call-run"
7166 assert kwargs ["data" ] is override_data
7267 assert kwargs ["_dataset_version" ] == override_version
@@ -76,58 +71,52 @@ class TestRunnerContextMetadataMerge:
7671 def test_user_keys_win_on_collision (self ):
7772 ctx = _make_ctx (
7873 data = [{"input" : "a" }],
79- name = "n" ,
8074 metadata = {"sha" : "abc" , "branch" : "main" },
8175 )
82- ctx .run_experiment (task = _noop_task , metadata = {"sha" : "def" , "pr" : "42" })
76+ ctx .run_experiment (
77+ name = "exp" , task = _noop_task , metadata = {"sha" : "def" , "pr" : "42" }
78+ )
8379 assert ctx .client .run_experiment .call_args .kwargs ["metadata" ] == {
8480 "sha" : "def" ,
8581 "branch" : "main" ,
8682 "pr" : "42" ,
8783 }
8884
8985 def test_context_metadata_only (self ):
90- ctx = _make_ctx (
91- data = [{"input" : "a" }], name = "n" , metadata = {"sha" : "abc" }
92- )
93- ctx .run_experiment (task = _noop_task )
86+ ctx = _make_ctx (data = [{"input" : "a" }], metadata = {"sha" : "abc" })
87+ ctx .run_experiment (name = "exp" , task = _noop_task )
9488 assert ctx .client .run_experiment .call_args .kwargs ["metadata" ] == {"sha" : "abc" }
9589
9690 def test_call_metadata_only (self ):
97- ctx = _make_ctx (data = [{"input" : "a" }], name = "n" )
98- ctx .run_experiment (task = _noop_task , metadata = {"pr" : "1" })
91+ ctx = _make_ctx (data = [{"input" : "a" }])
92+ ctx .run_experiment (name = "exp" , task = _noop_task , metadata = {"pr" : "1" })
9993 assert ctx .client .run_experiment .call_args .kwargs ["metadata" ] == {"pr" : "1" }
10094
10195 def test_both_none_stays_none (self ):
102- ctx = _make_ctx (data = [{"input" : "a" }], name = "n" )
103- ctx .run_experiment (task = _noop_task )
96+ ctx = _make_ctx (data = [{"input" : "a" }])
97+ ctx .run_experiment (name = "exp" , task = _noop_task )
10498 assert ctx .client .run_experiment .call_args .kwargs ["metadata" ] is None
10599
106100
107101class TestRunnerContextLocalItems :
108102 def test_local_items_pass_through_as_context_default (self ):
109103 items = [{"input" : "x" , "expected_output" : "y" }]
110- ctx = _make_ctx (data = items , name = "n" )
111- ctx .run_experiment (task = _noop_task )
104+ ctx = _make_ctx (data = items )
105+ ctx .run_experiment (name = "exp" , task = _noop_task )
112106 assert ctx .client .run_experiment .call_args .kwargs ["data" ] is items
113107
114108 def test_local_items_pass_through_as_call_override (self ):
115- ctx = _make_ctx (name = "n" )
109+ ctx = _make_ctx ()
116110 items = [{"input" : "x" }]
117- ctx .run_experiment (task = _noop_task , data = items )
111+ ctx .run_experiment (name = "exp" , task = _noop_task , data = items )
118112 assert ctx .client .run_experiment .call_args .kwargs ["data" ] is items
119113
120114
121115class TestRunnerContextValidation :
122- def test_missing_name_raises (self ):
123- ctx = _make_ctx (data = [{"input" : "a" }])
124- with pytest .raises (ValueError , match = "name" ):
125- ctx .run_experiment (task = _noop_task )
126-
127116 def test_missing_data_raises (self ):
128- ctx = _make_ctx (name = "n" )
117+ ctx = _make_ctx ()
129118 with pytest .raises (ValueError , match = "data" ):
130- ctx .run_experiment (task = _noop_task )
119+ ctx .run_experiment (name = "exp" , task = _noop_task )
131120
132121
133122class TestRegressionError :
@@ -155,7 +144,14 @@ def test_structured_message(self):
155144 assert "0.78" in str (exc )
156145 assert "0.9" in str (exc )
157146
158- def test_user_message_wins (self ):
147+ def test_free_form_message (self ):
148+ exc = RegressionError (
149+ result = MagicMock (),
150+ message = "custom explanation" ,
151+ )
152+ assert str (exc ) == "custom explanation"
153+
154+ def test_message_wins_over_structured (self ):
159155 exc = RegressionError (
160156 result = MagicMock (),
161157 metric = "avg_accuracy" ,
@@ -164,19 +160,33 @@ def test_user_message_wins(self):
164160 message = "custom explanation" ,
165161 )
166162 assert str (exc ) == "custom explanation"
163+ assert exc .metric == "avg_accuracy"
164+ assert exc .value == 0.5
165+ assert exc .threshold == 0.9
166+
167+ def test_partial_structured_falls_back_to_default (self ):
168+ """The structured overload requires ``metric`` and ``value`` together.
169+
170+ If a caller bypasses the type checker and passes only one, we fall
171+ back to the default message rather than rendering misleading
172+ ``None`` placeholders in the PR comment.
173+ """
174+ exc = RegressionError (result = MagicMock (), metric = "avg_accuracy" ) # type: ignore[call-overload]
175+ assert str (exc ) == "Experiment regression detected"
167176
168177
169178class TestSignatureDriftGuard :
170179 """Fails loudly if ``Langfuse.run_experiment`` grows a parameter that is
171180 not threaded through ``RunnerContext.run_experiment``.
172181
173- The four action-relaxed params (``name``, ``run_name``, ``data``,
174- ``_dataset_version``) are allowed to diverge: the RunnerContext variant
175- must be the ``Optional[...]`` of the client annotation so the action can
176- inject them.
182+ ``data`` is the only genuinely relaxed parameter: it is required on the
183+ client but optional on the RunnerContext so the action can inject it.
184+ ``run_name`` and ``_dataset_version`` are already ``Optional`` on the
185+ client and must match as-is. ``name`` is required on both — the action
186+ supports a directory of experiments, so each script must name itself.
177187 """
178188
179- RELAXED_PARAMS = {"name" , "run_name" , "data" , "_dataset_version" }
189+ RELAXED_PARAMS = {"data" }
180190
181191 # `CompositeEvaluatorFunction` is only imported under TYPE_CHECKING in
182192 # ``langfuse.experiment`` to break the circular dependency with
0 commit comments