@@ -2472,39 +2472,151 @@ def run_experiment(
24722472 """Run an experiment on a dataset with automatic tracing and evaluation.
24732473
24742474 This method executes a task function on each item in the provided dataset,
2475- traces the execution with Langfuse, runs evaluators on the outputs,
2476- and returns formatted results.
2475+ automatically traces all executions with Langfuse for observability, runs
2476+ item-level and run-level evaluators on the outputs, and returns comprehensive
2477+ results with evaluation metrics.
2478+
2479+ The experiment system provides:
2480+ - Automatic tracing of all task executions
2481+ - Concurrent processing with configurable limits
2482+ - Comprehensive error handling that isolates failures
2483+ - Integration with Langfuse datasets for experiment tracking
2484+ - Flexible evaluation framework supporting both sync and async evaluators
24772485
24782486 Args:
2479- name: Human-readable name for the experiment
2480- description: Optional description of the experiment's purpose
2481- data: Array of data items to process (ExperimentItem or DatasetItem)
2482- task: Function that processes each data item and returns output
2483- evaluators: Optional list of functions to evaluate each item's output
2484- run_evaluators: Optional list of functions to evaluate the entire experiment
2485- max_concurrency: Maximum number of concurrent task executions
2486- metadata: Optional metadata to attach to the experiment
+        name: Human-readable name for the experiment. Used for identification
+            in the Langfuse UI and for dataset run naming if using Langfuse datasets.
+        description: Optional description explaining the experiment's purpose,
+            methodology, or expected outcomes.
+        data: Array of data items to process. Can be either:
+            - List of dict-like items with 'input', 'expected_output', 'metadata' keys
+            - List of Langfuse DatasetItem objects from dataset.items
+        task: Function that processes each data item and returns output.
+            Must accept 'item' as a keyword argument and may be synchronous or asynchronous.
+            The task function signature should be: task(*, item, **kwargs) -> Any
+        evaluators: List of functions to evaluate each item's output individually.
+            Each evaluator receives input, output, expected_output, and metadata.
+            Can return a single Evaluation dict or a list of Evaluation dicts
+            (see the multi-metric evaluator sketch after the basic example below).
+        run_evaluators: List of functions to evaluate the entire experiment run.
+            Each run evaluator receives all item_results and can compute aggregate metrics.
+            Useful for calculating averages, distributions, or cross-item comparisons.
+        max_concurrency: Maximum number of concurrent task executions (default: 50).
+            Controls the number of items processed simultaneously. Adjust based on
+            API rate limits and system resources.
+        metadata: Optional metadata dictionary to attach to all experiment traces.
+            This metadata will be included in every trace created during the experiment.
 
     Returns:
-        ExperimentResult containing item results, evaluations, and formatting functions
+        ExperimentResult dictionary containing:
+            - item_results: List of results for each processed item with outputs and evaluations
+            - run_evaluations: List of aggregate evaluation results for the entire run
+            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
+            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
 
-    Example:
+    Raises:
+        ValueError: If required parameters are missing or invalid
+        Exception: If experiment setup fails (individual item failures are handled gracefully)
+
+    Examples:
+        Basic experiment with local data:
         ```python
-        def task(item):
-            return f"Processed: {item['input']}"
+        def summarize_text(*, item, **kwargs):
+            return f"Summary: {item['input'][:50]}..."
 
-        def evaluator(*, input, output, expected_output=None, **kwargs):
-            return {"name": "length", "value": len(output)}
+        def length_evaluator(*, input, output, expected_output=None, **kwargs):
+            return {
+                "name": "output_length",
+                "value": len(output),
+                "comment": f"Output contains {len(output)} characters"
+            }
 
         result = langfuse.run_experiment(
-            name="Test Experiment",
-            data=[{"input": "test", "expected_output": "expected"}],
-            task=task,
-            evaluators=[evaluator]
+            name="Text Summarization Test",
+            description="Evaluate summarization quality and length",
+            data=[
+                {"input": "Long article text...", "expected_output": "Expected summary"},
+                {"input": "Another article...", "expected_output": "Another summary"}
+            ],
+            task=summarize_text,
+            evaluators=[length_evaluator]
         )
 
-        print(result["item_results"])
+        print(f"Processed {len(result['item_results'])} items")
+        for item_result in result["item_results"]:
+            print(f"Input: {item_result['item']['input']}")
+            print(f"Output: {item_result['output']}")
+            print(f"Evaluations: {item_result['evaluations']}")
         ```
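+
+        An evaluator may also report several metrics at once by returning a list
+        of Evaluation dicts. A minimal sketch (the metric names here are
+        illustrative, not defined by the SDK):
+        ```python
+        def multi_metric_evaluator(*, input, output, expected_output=None, **kwargs):
+            # A single evaluator call can emit multiple scores for the same item
+            return [
+                {"name": "output_length", "value": len(output)},
+                {"name": "has_output", "value": 1.0 if output.strip() else 0.0},
+            ]
+        ```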
+
+        Advanced experiment with async task and multiple evaluators:
+        ```python
+        async def llm_task(*, item, **kwargs):
+            # Simulate async LLM call
+            response = await openai_client.chat.completions.create(
+                model="gpt-4",
+                messages=[{"role": "user", "content": item["input"]}]
+            )
+            return response.choices[0].message.content
+
+        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
+            if expected_output and expected_output.lower() in output.lower():
+                return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
+            return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
+
+        def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
+            # Simulate toxicity check
+            toxicity_score = check_toxicity(output)  # Your toxicity checker
+            return {
+                "name": "toxicity",
+                "value": toxicity_score,
+                "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
+            }
+
+        def average_accuracy(*, item_results, **kwargs):
+            accuracies = [
+                eval["value"] for result in item_results
+                for eval in result["evaluations"]
+                if eval["name"] == "accuracy"
+            ]
+            return {
+                "name": "average_accuracy",
+                "value": sum(accuracies) / len(accuracies) if accuracies else 0,
+                "comment": f"Average accuracy across {len(accuracies)} items"
+            }
+
+        result = langfuse.run_experiment(
+            name="LLM Safety and Accuracy Test",
+            description="Evaluate model accuracy and safety across diverse prompts",
+            data=test_dataset,  # Your dataset items
+            task=llm_task,
+            evaluators=[accuracy_evaluator, toxicity_evaluator],
+            run_evaluators=[average_accuracy],
+            max_concurrency=5,  # Limit concurrent API calls
+            metadata={"model": "gpt-4", "temperature": 0.7}
+        )
+        ```
+
+        Using with Langfuse datasets:
+        ```python
+        # Get dataset from Langfuse
+        dataset = langfuse.get_dataset("my-eval-dataset")
+
+        result = dataset.run_experiment(
+            name="Production Model Evaluation",
+            description="Monthly evaluation of production model performance",
+            task=my_production_task,
+            evaluators=[accuracy_evaluator, latency_evaluator]
+        )
+
+        # Results automatically linked to dataset in Langfuse UI
+        print(f"View results: {result['dataset_run_url']}")
+        ```
+
+    Note:
+        - Task and evaluator functions can be either synchronous or asynchronous
+          (see the async evaluator sketch below)
+        - Individual item failures are logged but don't stop the experiment
+        - All executions are automatically traced and visible in Langfuse UI
+        - When using Langfuse datasets, results are automatically linked for easy comparison
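+
+        A rough sketch of an async evaluator (the judge_relevance coroutine is a
+        hypothetical helper, not part of the SDK):
+        ```python
+        async def relevance_evaluator(*, input, output, expected_output=None, **kwargs):
+            # Evaluators may be coroutines; judge_relevance is a hypothetical async helper
+            score = await judge_relevance(question=input, answer=output)
+            return {"name": "relevance", "value": score}
+        ```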
25082620 """
25092621 return asyncio .run (
25102622 self ._run_experiment_async (
@@ -2596,7 +2708,7 @@ async def process_item(item: ExperimentItem) -> dict:
         self.create_score(
             dataset_run_id=dataset_run_id,
             name=evaluation["name"],
-            value=evaluation["value"],
+            value=evaluation["value"],  # type: ignore
             comment=evaluation.get("comment"),
             metadata=evaluation.get("metadata"),
         )
@@ -2718,7 +2830,7 @@ async def _process_experiment_item(
         self.create_score(
             trace_id=trace_id,
             name=evaluation.get("name", "unknown"),
-            value=evaluation.get("value", -1),
+            value=evaluation.get("value", -1),  # type: ignore
             comment=evaluation.get("comment"),
             metadata=evaluation.get("metadata"),
         )