@@ -2543,11 +2543,11 @@ def length_evaluator(*, input, output, expected_output=None, **kwargs):
25432543 evaluators=[length_evaluator]
25442544 )
25452545
2546- print(f"Processed {len(result['item_results'])} items")
2547- for item_result in result["item_results"]:
2548- print(f"Input: {item_result['item']['input']}")
2549- print(f"Output: {item_result['output']}")
2550- print(f"Evaluations: {item_result['evaluations']}")
2546+ print(f"Processed {len(result.item_results)} items")
2547+ for item_result in result.item_results:
2548+ print(f"Input: {item_result.item['input']}")
2549+ print(f"Output: {item_result.output}")
2550+ print(f"Evaluations: {item_result.evaluations}")
25512551 ```
25522552
25532553 Advanced experiment with async task and multiple evaluators:
@@ -2576,9 +2576,9 @@ def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
25762576
25772577 def average_accuracy(*, item_results, **kwargs):
25782578 accuracies = [
2579- eval["value"] for result in item_results
2580- for eval in result["evaluations"]
2581- if eval["name"] == "accuracy"
2579+ eval.value for result in item_results
2580+ for eval in result.evaluations
2581+ if eval.name == "accuracy"
25822582 ]
25832583 return {
25842584 "name": "average_accuracy",
@@ -2656,7 +2656,7 @@ async def _run_experiment_async(
26562656 semaphore = asyncio .Semaphore (max_concurrency )
26572657
26582658 # Process all items
2659- async def process_item (item : ExperimentItem ) -> dict :
2659+ async def process_item (item : ExperimentItem ) -> ExperimentItemResult :
26602660 async with semaphore :
26612661 return await self ._process_experiment_item (
26622662 item , task , evaluators , name , description , metadata
@@ -2671,7 +2671,7 @@ async def process_item(item: ExperimentItem) -> dict:
26712671 for i , result in enumerate (item_results ):
26722672 if isinstance (result , Exception ):
26732673 langfuse_logger .error (f"Item { i } failed: { result } " )
2674- elif isinstance (result , dict ):
2674+ elif isinstance (result , ExperimentItemResult ):
26752675 valid_results .append (result ) # type: ignore
26762676
26772677 # Run experiment-level evaluators
@@ -2686,9 +2686,7 @@ async def process_item(item: ExperimentItem) -> dict:
26862686 langfuse_logger .error (f"Run evaluator failed: { e } " )
26872687
26882688 # Generate dataset run URL if applicable
2689- dataset_run_id = (
2690- valid_results [0 ].get ("dataset_run_id" ) if valid_results else None
2691- )
2689+ dataset_run_id = valid_results [0 ].dataset_run_id if valid_results else None
26922690 dataset_run_url = None
26932691 if dataset_run_id and data :
26942692 try :
@@ -2714,11 +2712,11 @@ async def process_item(item: ExperimentItem) -> dict:
27142712 if dataset_run_id :
27152713 self .create_score (
27162714 dataset_run_id = dataset_run_id ,
2717- name = evaluation .get ( " name" ) or "<unknown>" ,
2718- value = evaluation .get ( " value" ) , # type: ignore
2719- comment = evaluation .get ( " comment" ) ,
2720- metadata = evaluation .get ( " metadata" ) ,
2721- data_type = evaluation .get ( " data_type" ) , # type: ignore
2715+ name = evaluation .name or "<unknown>" ,
2716+ value = evaluation .value , # type: ignore
2717+ comment = evaluation .comment ,
2718+ metadata = evaluation .metadata ,
2719+ data_type = evaluation .data_type , # type: ignore
27222720 )
27232721
27242722 except Exception as e :
@@ -2727,14 +2725,14 @@ async def process_item(item: ExperimentItem) -> dict:
27272725 # Flush scores and traces
27282726 self .flush ()
27292727
2730- return {
2731- " name" : name ,
2732- " description" : description ,
2733- " item_results" : valid_results ,
2734- " run_evaluations" : run_evaluations ,
2735- " dataset_run_id" : dataset_run_id ,
2736- " dataset_run_url" : dataset_run_url ,
2737- }
2728+ return ExperimentResult (
2729+ name = name ,
2730+ description = description ,
2731+ item_results = valid_results ,
2732+ run_evaluations = run_evaluations ,
2733+ dataset_run_id = dataset_run_id ,
2734+ dataset_run_url = dataset_run_url ,
2735+ )
27382736
27392737 async def _process_experiment_item (
27402738 self ,
@@ -2744,7 +2742,7 @@ async def _process_experiment_item(
27442742 experiment_name : str ,
27452743 experiment_description : Optional [str ],
27462744 experiment_metadata : Dict [str , Any ],
2747- ) -> dict :
2745+ ) -> ExperimentItemResult :
27482746 # Execute task with tracing
27492747 span_name = "experiment-item-run"
27502748
@@ -2842,22 +2840,24 @@ async def _process_experiment_item(
28422840 for evaluation in eval_results :
28432841 self .create_score (
28442842 trace_id = trace_id ,
2845- name = evaluation .get ("name" , "unknown" ),
2846- value = evaluation .get ("value" , - 1 ), # type: ignore
2847- comment = evaluation .get ("comment" ),
2848- metadata = evaluation .get ("metadata" ),
2843+ name = evaluation .name or "unknown" ,
2844+ value = evaluation .value
2845+ if evaluation .value is not None
2846+ else - 1 , # type: ignore
2847+ comment = evaluation .comment ,
2848+ metadata = evaluation .metadata ,
28492849 )
28502850
28512851 except Exception as e :
28522852 langfuse_logger .error (f"Evaluator failed: { e } " )
28532853
2854- return {
2855- " item" : item ,
2856- " output" : output ,
2857- " evaluations" : evaluations ,
2858- " trace_id" : trace_id ,
2859- " dataset_run_id" : dataset_run_id ,
2860- }
2854+ return ExperimentItemResult (
2855+ item = item ,
2856+ output = output ,
2857+ evaluations = evaluations ,
2858+ trace_id = trace_id ,
2859+ dataset_run_id = dataset_run_id ,
2860+ )
28612861
28622862 except Exception as e :
28632863 span .update (
0 commit comments