Skip to content

Commit 43cfd57

Browse files
wochinge and claude committed
refactor(experiment): move RunnerContext and RegressionError into experiment module
Relocates the CI-action primitives from the standalone `langfuse/ci.py` module into `langfuse/experiment.py` alongside the other experiment types. Deletes `langfuse/ci.py` and renames the tests accordingly. The public import paths (`from langfuse import RunnerContext, RegressionError`) are unchanged. `CompositeEvaluatorFunction` is imported under `TYPE_CHECKING` to avoid a circular import with `langfuse.batch_evaluation`. The signature-drift guard now resolves the forward reference via `typing.get_type_hints(..., localns=...)`. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 8ee0576 commit 43cfd57

4 files changed

Lines changed: 183 additions & 192 deletions

File tree

langfuse/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@
88
EvaluatorStats,
99
MapperFunction,
1010
)
11-
from langfuse.ci import RegressionError, RunnerContext
12-
from langfuse.experiment import Evaluation
11+
from langfuse.experiment import Evaluation, RegressionError, RunnerContext
1312

1413
from ._client import client as _client_module
1514
from ._client.attributes import LangfuseOtelSpanAttributes

langfuse/ci.py

Lines changed: 0 additions & 166 deletions
This file was deleted.

langfuse/experiment.py

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
"""
77

88
import asyncio
9+
from datetime import datetime
910
from typing import (
11+
TYPE_CHECKING,
1012
Any,
1113
Awaitable,
1214
Dict,
@@ -21,6 +23,10 @@
2123
from langfuse.logger import langfuse_logger as logger
2224
from langfuse.types import ExperimentScoreType
2325

26+
if TYPE_CHECKING:
27+
from langfuse._client.client import Langfuse
28+
from langfuse.batch_evaluation import CompositeEvaluatorFunction
29+
2430

2531
class LocalExperimentItem(TypedDict, total=False):
2632
"""Structure for local experiment data items (not from Langfuse datasets).
@@ -1049,3 +1055,148 @@ def langfuse_evaluator(
10491055
)
10501056

10511057
return langfuse_evaluator
1058+
1059+
1060+
class RunnerContext:
    """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action builds a
    ``RunnerContext`` before invoking the user's ``experiment(context)``
    function. Defaults set here (dataset, name, run name, metadata tags) are
    applied when the user omits them on the :meth:`run_experiment` call;
    users can override any default by passing the corresponding argument
    explicitly.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        name: Optional[str] = None,
        run_name: Optional[str] = None,
        metadata: Optional[Dict[str, str]] = None,
    ):
        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.

        Typically called by the ``langfuse/experiment-action`` GitHub Action,
        not by end users directly. Every field except ``client`` is optional:
        fields left as ``None`` simply mean the corresponding argument must be
        supplied on the :meth:`run_experiment` call.

        Args:
            client: Initialized Langfuse SDK client used to execute the
                experiment. The action creates this from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default dataset items to run the experiment on. Accepts
                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                If ``None``, the user must pass ``data=`` to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version. Injected by the
                action when ``dataset_version`` is configured.
            name: Default human-readable experiment name (e.g. the action's
                ``experiment_name`` input). If ``None``, the user must pass
                ``name=`` to :meth:`run_experiment`.
            run_name: Default exact run name. The action typically derives
                this from the commit SHA / PR number so that reruns produce
                distinct runs in Langfuse.
            metadata: Default metadata attached to every experiment trace and
                the dataset run. The action injects GitHub-sourced tags (SHA,
                PR link, workflow run link, branch, GH user, etc.). Merged
                with any ``metadata`` passed to :meth:`run_experiment`, with
                user-supplied keys winning on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.name = name
        self.run_name = run_name
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: Optional[str] = None,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: List[EvaluatorFunction] = [],
        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
        run_evaluators: List[RunEvaluatorFunction] = [],
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment through ``self.client``, filling in context defaults.

        ``name``, ``run_name``, ``data`` and ``_dataset_version`` fall back to
        the value stored on this context when the argument is ``None``.
        ``metadata`` is merged with the context metadata; keys passed here win
        on collision. All remaining arguments are forwarded unchanged to
        ``self.client.run_experiment``.

        Args:
            name: Experiment name; falls back to ``self.name``.
            run_name: Run name; falls back to ``self.run_name``.
            description: Forwarded as-is.
            data: Items to run on; falls back to ``self.data``.
            task: Forwarded as-is.
            evaluators: Forwarded as a fresh list copy.
            composite_evaluator: Forwarded as-is.
            run_evaluators: Forwarded as a fresh list copy.
            max_concurrency: Forwarded as-is.
            metadata: Per-call metadata merged over ``self.metadata``.
            _dataset_version: Falls back to ``self.dataset_version``.

        Returns:
            Whatever ``self.client.run_experiment`` returns.

        Raises:
            ValueError: If neither the call nor the context supplies ``name``,
                or neither supplies ``data``.
        """
        # NOTE(review): the `[]` defaults above are kept as-is so the signature
        # stays identical for callers and for any signature comparison against
        # `Langfuse.run_experiment` -- confirm before switching them to `None`.
        resolved_name = name if name is not None else self.name
        if resolved_name is None:
            raise ValueError(
                "`name` must be provided either on the RunnerContext or the run_experiment call"
            )

        # Only `None` triggers the fallback; an explicitly passed empty list is
        # respected and forwarded.
        resolved_data = data if data is not None else self.data
        if resolved_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_run_name = run_name if run_name is not None else self.run_name
        resolved_dataset_version = (
            _dataset_version if _dataset_version is not None else self.dataset_version
        )

        merged_metadata: Optional[Dict[str, str]]
        if self.metadata is None and metadata is None:
            # Preserve "no metadata at all" as None rather than an empty dict.
            merged_metadata = None
        else:
            # Later unpacking wins, so per-call keys override context defaults.
            merged_metadata = {**(self.metadata or {}), **(metadata or {})}

        return self.client.run_experiment(
            name=resolved_name,
            run_name=resolved_run_name,
            description=description,
            data=resolved_data,
            task=task,
            # Copy the evaluator lists so the shared default `[]` objects (and
            # caller-owned lists) can never be mutated by downstream code.
            evaluators=list(evaluators),
            composite_evaluator=composite_evaluator,
            run_evaluators=list(run_evaluators),
            max_concurrency=max_concurrency,
            metadata=merged_metadata,
            _dataset_version=resolved_dataset_version,
        )
1171+
1172+
1173+
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action catches this
    exception and, when ``should_fail_on_error`` is enabled, fails the
    workflow run and renders a callout in the PR comment using
    ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.
    """

    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ):
        """Store the regression details and build the exception message.

        Args:
            result: The experiment result the regression was detected on;
                exposed as ``self.result``.
            metric: Name of the regressed metric; exposed as ``self.metric``.
            value: Observed value of the metric; exposed as ``self.value``.
            threshold: Threshold the value was compared against; exposed as
                ``self.threshold``.
            message: Explicit exception message. When given it is used
                verbatim and takes precedence over the generated one.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None:
            # Only mention the parts that were actually supplied, so a bare
            # `metric` does not render as "...: None (threshold None)".
            formatted = f"Regression on `{metric}`"
            if value is not None:
                formatted += f": {value}"
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)

0 commit comments

Comments
 (0)