
Commit e4a4599

move to classes
1 parent 009c191 commit e4a4599

4 files changed: 527 additions & 361 deletions


langfuse/_client/client.py

Lines changed: 38 additions & 38 deletions
@@ -2543,11 +2543,11 @@ def length_evaluator(*, input, output, expected_output=None, **kwargs):
     evaluators=[length_evaluator]
 )

-print(f"Processed {len(result['item_results'])} items")
-for item_result in result["item_results"]:
-    print(f"Input: {item_result['item']['input']}")
-    print(f"Output: {item_result['output']}")
-    print(f"Evaluations: {item_result['evaluations']}")
+print(f"Processed {len(result.item_results)} items")
+for item_result in result.item_results:
+    print(f"Input: {item_result.item['input']}")
+    print(f"Output: {item_result.output}")
+    print(f"Evaluations: {item_result.evaluations}")
 ```

 Advanced experiment with async task and multiple evaluators:
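
For orientation, here is a minimal sketch of the two result classes this commit migrates to, with field names taken from the diff. The dataclass form is an assumption; the real definitions live elsewhere in the SDK and may differ:

```python
# Hypothetical shapes inferred from the fields used in this diff.
from dataclasses import dataclass, field
from typing import Any, List, Optional


@dataclass
class ExperimentItemResult:
    item: Any                        # the dataset item the task ran on
    output: Any                      # whatever the task returned
    evaluations: List[Any]           # per-item evaluation results
    trace_id: Optional[str] = None
    dataset_run_id: Optional[str] = None


@dataclass
class ExperimentResult:
    name: str
    description: Optional[str]
    item_results: List[ExperimentItemResult]
    run_evaluations: List[Any] = field(default_factory=list)
    dataset_run_id: Optional[str] = None
    dataset_run_url: Optional[str] = None
```

Attribute access (`result.item_results`) then replaces the old dict subscripting (`result["item_results"]`) throughout the examples and internals below.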
@@ -2576,9 +2576,9 @@ def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):

 def average_accuracy(*, item_results, **kwargs):
     accuracies = [
-        eval["value"] for result in item_results
-        for eval in result["evaluations"]
-        if eval["name"] == "accuracy"
+        eval.value for result in item_results
+        for eval in result.evaluations
+        if eval.name == "accuracy"
     ]
     return {
         "name": "average_accuracy",
@@ -2656,7 +2656,7 @@ async def _run_experiment_async(
         semaphore = asyncio.Semaphore(max_concurrency)

         # Process all items
-        async def process_item(item: ExperimentItem) -> dict:
+        async def process_item(item: ExperimentItem) -> ExperimentItemResult:
             async with semaphore:
                 return await self._process_experiment_item(
                     item, task, evaluators, name, description, metadata
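
The hunk above relies on the standard asyncio pattern of bounding fan-out with a semaphore. A self-contained sketch of that pattern, with illustrative names rather than the SDK's:

```python
import asyncio


async def worker(item: int) -> int:
    await asyncio.sleep(0.1)  # stand-in for the real per-item task
    return item * 2


async def run_all(items: list, max_concurrency: int = 5) -> list:
    # At most max_concurrency coroutines hold the semaphore at once;
    # the rest wait inside gather until a slot frees up.
    semaphore = asyncio.Semaphore(max_concurrency)

    async def bounded(item):
        async with semaphore:
            return await worker(item)

    # return_exceptions=True mirrors the diff: failures come back as
    # Exception objects to be filtered out, instead of cancelling the batch.
    return await asyncio.gather(
        *(bounded(i) for i in items), return_exceptions=True
    )


print(asyncio.run(run_all(list(range(10)))))
```

This is why the next hunk filters `item_results` with `isinstance(result, Exception)` before accepting a result as valid.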
@@ -2671,7 +2671,7 @@ async def process_item(item: ExperimentItem) -> dict:
         for i, result in enumerate(item_results):
             if isinstance(result, Exception):
                 langfuse_logger.error(f"Item {i} failed: {result}")
-            elif isinstance(result, dict):
+            elif isinstance(result, ExperimentItemResult):
                 valid_results.append(result)  # type: ignore

         # Run experiment-level evaluators
@@ -2686,9 +2686,7 @@ async def process_item(item: ExperimentItem) -> dict:
                 langfuse_logger.error(f"Run evaluator failed: {e}")

         # Generate dataset run URL if applicable
-        dataset_run_id = (
-            valid_results[0].get("dataset_run_id") if valid_results else None
-        )
+        dataset_run_id = valid_results[0].dataset_run_id if valid_results else None
         dataset_run_url = None
         if dataset_run_id and data:
             try:
@@ -2714,11 +2712,11 @@ async def process_item(item: ExperimentItem) -> dict:
                 if dataset_run_id:
                     self.create_score(
                         dataset_run_id=dataset_run_id,
-                        name=evaluation.get("name") or "<unknown>",
-                        value=evaluation.get("value"),  # type: ignore
-                        comment=evaluation.get("comment"),
-                        metadata=evaluation.get("metadata"),
-                        data_type=evaluation.get("data_type"),  # type: ignore
+                        name=evaluation.name or "<unknown>",
+                        value=evaluation.value,  # type: ignore
+                        comment=evaluation.comment,
+                        metadata=evaluation.metadata,
+                        data_type=evaluation.data_type,  # type: ignore
                     )

         except Exception as e:
@@ -2727,14 +2725,14 @@ async def process_item(item: ExperimentItem) -> dict:
         # Flush scores and traces
         self.flush()

-        return {
-            "name": name,
-            "description": description,
-            "item_results": valid_results,
-            "run_evaluations": run_evaluations,
-            "dataset_run_id": dataset_run_id,
-            "dataset_run_url": dataset_run_url,
-        }
+        return ExperimentResult(
+            name=name,
+            description=description,
+            item_results=valid_results,
+            run_evaluations=run_evaluations,
+            dataset_run_id=dataset_run_id,
+            dataset_run_url=dataset_run_url,
+        )

     async def _process_experiment_item(
         self,
@@ -2744,7 +2742,7 @@ async def _process_experiment_item(
         experiment_name: str,
         experiment_description: Optional[str],
         experiment_metadata: Dict[str, Any],
-    ) -> dict:
+    ) -> ExperimentItemResult:
         # Execute task with tracing
         span_name = "experiment-item-run"

@@ -2842,22 +2840,24 @@ async def _process_experiment_item(
                 for evaluation in eval_results:
                     self.create_score(
                         trace_id=trace_id,
-                        name=evaluation.get("name", "unknown"),
-                        value=evaluation.get("value", -1),  # type: ignore
-                        comment=evaluation.get("comment"),
-                        metadata=evaluation.get("metadata"),
+                        name=evaluation.name or "unknown",
+                        value=evaluation.value
+                        if evaluation.value is not None
+                        else -1,  # type: ignore
+                        comment=evaluation.comment,
+                        metadata=evaluation.metadata,
                     )

             except Exception as e:
                 langfuse_logger.error(f"Evaluator failed: {e}")

-            return {
-                "item": item,
-                "output": output,
-                "evaluations": evaluations,
-                "trace_id": trace_id,
-                "dataset_run_id": dataset_run_id,
-            }
+            return ExperimentItemResult(
+                item=item,
+                output=output,
+                evaluations=evaluations,
+                trace_id=trace_id,
+                dataset_run_id=dataset_run_id,
+            )

         except Exception as e:
             span.update(
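
One behavioral nuance in this hunk: `dict.get("name", "unknown")` falls back only when the key is absent, whereas `evaluation.name or "unknown"` also replaces `None` and the empty string. For `value`, the rewrite uses an explicit `is not None` check, which (unlike `or`) keeps falsy-but-valid scores such as `0`:

```python
# Emulating the old dict shape to show the difference.
evaluation = {"name": "", "value": 0}

print(evaluation.get("name", "unknown"))   # "" -- default applies only if the key is missing
print(evaluation.get("value", -1))         # 0

name, value = evaluation["name"], evaluation["value"]
print(name or "unknown")                   # "unknown" -- empty string is falsy
print(value if value is not None else -1)  # 0 -- preserved, unlike `value or -1`
```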

langfuse/_client/datasets.py

Lines changed: 12 additions & 3 deletions
@@ -237,12 +237,21 @@ def run_experiment(
                 Will be combined with individual item metadata.

         Returns:
-            ExperimentResult dictionary containing:
+            ExperimentResult object containing:
+            - name: The experiment name
+            - description: Optional experiment description
             - item_results: Results for each dataset item with outputs and evaluations
             - run_evaluations: Aggregate evaluation results for the entire run
             - dataset_run_id: ID of the created dataset run in Langfuse
             - dataset_run_url: Direct URL to view the experiment results in Langfuse UI

+            The result object provides a format() method for human-readable output:
+            ```python
+            result = dataset.run_experiment(...)
+            print(result.format())  # Summary view
+            print(result.format(include_item_results=True))  # Detailed view
+            ```
+
         Raises:
             ValueError: If the dataset has no items or no Langfuse client is available

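
Continuing the hypothetical ExperimentResult sketch from earlier, a format() method matching this documented behavior might look like the following; the SDK's actual implementation is not shown in this diff:

```python
def format(self, include_item_results: bool = False) -> str:
    # Hypothetical implementation, for illustration only.
    lines = [f"Experiment: {self.name}"]
    if self.description:
        lines.append(f"Description: {self.description}")
    lines.append(f"Items: {len(self.item_results)}")
    if self.dataset_run_url:
        lines.append(f"Run URL: {self.dataset_run_url}")
    if include_item_results:
        for r in self.item_results:
            lines.append(
                f"- input={r.item!r} output={r.output!r} evaluations={r.evaluations!r}"
            )
    return "\n".join(lines)
```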
@@ -372,8 +381,8 @@ def content_diversity(*, item_results, **kwargs):

             # Both experiments are now visible in Langfuse for easy comparison
             print("Compare results in Langfuse:")
-            print(f"GPT-4: {result_gpt4['dataset_run_url']}")
-            print(f"Custom: {result_custom['dataset_run_url']}")
+            print(f"GPT-4: {result_gpt4.dataset_run_url}")
+            print(f"Custom: {result_custom.dataset_run_url}")
             ```

         Note:
