@@ -4,6 +4,7 @@

 from opentelemetry.util._decorator import _agnosticcontextmanager

+from langfuse.batch_evaluation import CompositeEvaluatorFunction
 from langfuse.experiment import (
     EvaluatorFunction,
     ExperimentResult,
@@ -204,6 +205,7 @@ def run_experiment(
         description: Optional[str] = None,
         task: TaskFunction,
         evaluators: List[EvaluatorFunction] = [],
+        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
         run_evaluators: List[RunEvaluatorFunction] = [],
         max_concurrency: int = 50,
         metadata: Optional[Dict[str, Any]] = None,
@@ -234,6 +236,10 @@ def run_experiment(
                 .metadata attributes. Signature should be: task(*, item, **kwargs) -> Any
             evaluators: List of functions to evaluate each item's output individually.
                 These will have access to the item's expected_output for comparison.
+            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
+                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
+                plus the list of evaluations those evaluators produced. Useful for weighted averages,
+                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
             run_evaluators: List of functions to evaluate the entire experiment run.
                 Useful for computing aggregate statistics across all dataset items.
             max_concurrency: Maximum number of concurrent task executions (default: 50).
@@ -411,6 +417,7 @@ def content_diversity(*, item_results, **kwargs):
             data=self.items,
             task=task,
             evaluators=evaluators,
+            composite_evaluator=composite_evaluator,
             run_evaluators=run_evaluators,
             max_concurrency=max_concurrency,
             metadata=metadata,
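
For reference, a minimal sketch of a composite evaluator that this parameter could accept, based on the docstring above. The score names, the weights, the `evaluations` keyword, and the `Evaluation` import path mirror the diff's conventions but are assumptions, not confirmed API from this PR:

from typing import Any, Dict, List, Optional

from langfuse.experiment import Evaluation  # assumed export, alongside EvaluatorFunction

def weighted_composite(
    *,
    input: Any,
    output: Any,
    expected_output: Any = None,
    metadata: Optional[Dict[str, Any]] = None,
    evaluations: List[Evaluation],
    **kwargs: Any,
) -> Evaluation:
    # Hypothetical weights for two item-level scores named "accuracy" and "fluency".
    weights = {"accuracy": 0.7, "fluency": 0.3}
    total = 0.0
    weight_sum = 0.0
    for ev in evaluations:
        # Assumes Evaluation objects expose .name and .value attributes.
        w = weights.get(ev.name)
        if w is not None and isinstance(ev.value, (int, float)):
            total += w * float(ev.value)
            weight_sum += w
    return Evaluation(
        name="weighted_composite",
        value=total / weight_sum if weight_sum else 0.0,
        comment="Weighted average of accuracy and fluency scores",
    )

It would then be wired into the new parameter, e.g. run_experiment(name="my-experiment", task=my_task, evaluators=[accuracy, fluency], composite_evaluator=weighted_composite), where accuracy and fluency are hypothetical item-level evaluators.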
|