|
6 | 6 | """ |
7 | 7 |
|
8 | 8 | import asyncio |
| 9 | +from datetime import datetime |
9 | 10 | from typing import ( |
| 11 | + TYPE_CHECKING, |
10 | 12 | Any, |
11 | 13 | Awaitable, |
12 | 14 | Dict, |
|
21 | 23 | from langfuse.logger import langfuse_logger as logger |
22 | 24 | from langfuse.types import ExperimentScoreType |
23 | 25 |
|
| 26 | +if TYPE_CHECKING: |
| 27 | + from langfuse._client.client import Langfuse |
| 28 | + from langfuse.batch_evaluation import CompositeEvaluatorFunction |
| 29 | + |
24 | 30 |
|
25 | 31 | class LocalExperimentItem(TypedDict, total=False): |
26 | 32 | """Structure for local experiment data items (not from Langfuse datasets). |
@@ -1049,3 +1055,148 @@ def langfuse_evaluator( |
1049 | 1055 | ) |
1050 | 1056 |
|
1051 | 1057 | return langfuse_evaluator |
| 1058 | + |
| 1059 | + |
class RunnerContext:
    """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action builds a
    ``RunnerContext`` before invoking the user's ``experiment(context)``
    function. Defaults set here (dataset, name, run name, metadata tags) are
    applied when the user omits them on the :meth:`run_experiment` call;
    users can override any default by passing the corresponding argument
    explicitly.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        name: Optional[str] = None,
        run_name: Optional[str] = None,
        metadata: Optional[Dict[str, str]] = None,
    ):
        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.

        Typically called by the ``langfuse/experiment-action`` GitHub Action,
        not by end users directly. Every field except ``client`` is optional:
        fields left as ``None`` simply mean the corresponding argument must be
        supplied on the :meth:`run_experiment` call.

        Args:
            client: Initialized Langfuse SDK client used to execute the
                experiment. The action creates this from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default dataset items to run the experiment on. Accepts
                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                If ``None``, the user must pass ``data=`` to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version. Injected by the
                action when ``dataset_version`` is configured.
            name: Default human-readable experiment name (e.g. the action's
                ``experiment_name`` input). If ``None``, the user must pass
                ``name=`` to :meth:`run_experiment`.
            run_name: Default exact run name. The action typically derives
                this from the commit SHA / PR number so that reruns produce
                distinct runs in Langfuse.
            metadata: Default metadata attached to every experiment trace and
                the dataset run. The action injects GitHub-sourced tags (SHA,
                PR link, workflow run link, branch, GH user, etc.). Merged
                with any ``metadata`` passed to :meth:`run_experiment`, with
                user-supplied keys winning on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.name = name
        self.run_name = run_name
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: Optional[str] = None,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: Optional[List[EvaluatorFunction]] = None,
        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
        run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment through the wrapped client, filling in context defaults.

        Each of ``name``, ``run_name``, ``data`` and ``_dataset_version`` falls
        back to the value stored on this context when the argument is ``None``.
        ``metadata`` is merged with the context metadata rather than replacing
        it. All remaining arguments are forwarded to ``client.run_experiment``
        as given.

        Args:
            name: Experiment name; falls back to the context's ``name``.
            run_name: Exact run name; falls back to the context's ``run_name``.
            description: Forwarded unchanged to the client.
            data: Items to run on; falls back to the context's ``data``.
            task: Task function forwarded unchanged to the client.
            evaluators: Item-level evaluators. ``None`` (the default) is
                forwarded as a new empty list.
            composite_evaluator: Forwarded unchanged to the client.
            run_evaluators: Run-level evaluators. ``None`` (the default) is
                forwarded as a new empty list.
            max_concurrency: Forwarded unchanged to the client.
            metadata: Extra metadata; keys given here override the context's
                metadata keys on collision.
            _dataset_version: Dataset version override; falls back to the
                context's ``dataset_version``.

        Returns:
            Whatever ``client.run_experiment`` returns.

        Raises:
            ValueError: If no ``name`` or no ``data`` is available from either
                the call or the context.
        """
        resolved_name = name if name is not None else self.name
        if resolved_name is None:
            raise ValueError(
                "`name` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_data = data if data is not None else self.data
        if resolved_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_run_name = run_name if run_name is not None else self.run_name
        resolved_dataset_version = (
            _dataset_version if _dataset_version is not None else self.dataset_version
        )

        # Keep ``None`` only when neither side supplied metadata; otherwise
        # build a new dict so neither input mapping is mutated, with call-site
        # keys taking precedence over the context defaults.
        merged_metadata: Optional[Dict[str, str]]
        if self.metadata is None and metadata is None:
            merged_metadata = None
        else:
            merged_metadata = {**(self.metadata or {}), **(metadata or {})}

        # Hand the client a fresh list per call instead of a shared default
        # list object, so nothing downstream can leak state between runs.
        resolved_evaluators = evaluators if evaluators is not None else []
        resolved_run_evaluators = (
            run_evaluators if run_evaluators is not None else []
        )

        return self.client.run_experiment(
            name=resolved_name,
            run_name=resolved_run_name,
            description=description,
            data=resolved_data,
            task=task,
            evaluators=resolved_evaluators,
            composite_evaluator=composite_evaluator,
            run_evaluators=resolved_run_evaluators,
            max_concurrency=max_concurrency,
            metadata=merged_metadata,
            _dataset_version=resolved_dataset_version,
        )
| 1171 | + |
| 1172 | + |
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action catches this
    exception and, when ``should_fail_on_error`` is enabled, fails the
    workflow run and renders a callout in the PR comment using
    ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    The constructor arguments ``result``, ``metric``, ``value`` and
    ``threshold`` are stored as attributes of the same name.
    """

    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ):
        """Create the error and derive its message.

        Args:
            result: The experiment result the regression was detected on.
            metric: Name of the metric that regressed, if any.
            value: Observed value of ``metric``.
            threshold: Threshold that ``value`` was checked against.
            message: Explicit message. When given it is used verbatim;
                otherwise the message is built from ``metric``/``value``/
                ``threshold`` if ``metric`` is set, else a generic text is
                used.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None:
            formatted = f"Regression on `{metric}`: {value} (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)

    def __reduce__(self):
        """Support ``copy`` and ``pickle`` despite the keyword-only constructor.

        The inherited ``BaseException.__reduce__`` rebuilds an exception as
        ``cls(*self.args)``, which fails here because ``__init__`` accepts no
        positional arguments and requires ``result``. Rebuilding through a
        keyword-bound partial keeps the stored fields and the message intact.
        """
        import functools

        rebuild = functools.partial(
            type(self),
            result=self.result,
            metric=self.metric,
            value=self.value,
            threshold=self.threshold,
            # Pass the already-formatted text so the copy reports the same message.
            message=str(self),
        )
        return rebuild, ()
0 commit comments