@@ -594,6 +594,7 @@ class BatchEvaluationResult:
594594 failed_item_ids: List of IDs for items that failed evaluation.
595595 error_summary: Dictionary mapping error types to occurrence counts.
596596 has_more_items: True if max_items limit was reached but more items exist.
597+ item_evaluations: Dictionary mapping each item ID to the list of evaluations produced for that item (both regular and composite).
597598
598599 Examples:
599600 Basic result inspection:
@@ -690,6 +691,7 @@ def __init__(
690691 failed_item_ids : List [str ],
691692 error_summary : Dict [str , int ],
692693 has_more_items : bool ,
694+ item_evaluations : Dict [str , List ["Evaluation" ]],
693695 ):
694696 """Initialize BatchEvaluationResult with comprehensive statistics.
695697
@@ -707,6 +709,7 @@ def __init__(
707709 failed_item_ids: IDs of failed items.
708710 error_summary: Error types and counts.
709711 has_more_items: Whether more items exist beyond max_items.
712+ item_evaluations: Dictionary mapping item IDs to their evaluation results.
710713
711714 Note:
712715 All arguments must be provided as keywords.
@@ -724,6 +727,7 @@ def __init__(
724727 self .failed_item_ids = failed_item_ids
725728 self .error_summary = error_summary
726729 self .has_more_items = has_more_items
730+ self .item_evaluations = item_evaluations
727731
728732 def __str__ (self ) -> str :
729733 """Return a formatted string representation of the batch evaluation results.
@@ -884,6 +888,7 @@ async def run_async(
884888 total_evaluations_failed = 0
885889 failed_item_ids : List [str ] = []
886890 error_summary : Dict [str , int ] = {}
891+ item_evaluations : Dict [str , List [Evaluation ]] = {}
887892
888893 # Initialize evaluator stats
889894 evaluator_stats_dict = {
@@ -958,6 +963,7 @@ async def run_async(
958963 failed_item_ids = failed_item_ids ,
959964 error_summary = error_summary ,
960965 has_more_items = has_more ,
966+ item_evaluations = item_evaluations ,
961967 )
962968
963969 # Check if we got any items
@@ -987,7 +993,7 @@ async def run_async(
987993 # Process items concurrently
988994 async def process_item (
989995 item : Union [TraceWithFullDetails , ObservationsView ],
990- ) -> Tuple [str , Union [Tuple [int , int , int ], Exception ]]:
996+ ) -> Tuple [str , Union [Tuple [int , int , int , List [ Evaluation ] ], Exception ]]:
991997 """Process a single item and return (item_id, result)."""
992998 async with semaphore :
993999 item_id = self ._get_item_id (item , scope )
@@ -1021,11 +1027,16 @@ async def process_item(
10211027 else :
10221028 # Item processed successfully
10231029 total_items_processed += 1
1024- scores_created , composite_created , evals_failed = result
1030+ scores_created , composite_created , evals_failed , evaluations = (
1031+ result
1032+ )
10251033 total_scores_created += scores_created
10261034 total_composite_scores_created += composite_created
10271035 total_evaluations_failed += evals_failed
10281036
1037+ # Store evaluations for this item
1038+ item_evaluations [item_id ] = evaluations
1039+
10291040 # Update last processed tracking
10301041 last_item_timestamp = self ._get_item_timestamp (item , scope )
10311042 last_item_id = item_id
@@ -1092,6 +1103,7 @@ async def process_item(
10921103 has_more_items = (
10931104 has_more and max_items is not None and total_items_fetched >= max_items
10941105 ),
1106+ item_evaluations = item_evaluations ,
10951107 )
10961108
10971109 async def _fetch_batch_with_retry (
@@ -1148,7 +1160,7 @@ async def _process_batch_evaluation_item(
11481160 composite_evaluator : Optional [CompositeEvaluatorFunction ],
11491161 metadata : Optional [Dict [str , Any ]],
11501162 evaluator_stats_dict : Dict [str , EvaluatorStats ],
1151- ) -> Tuple [int , int , int ]:
1163+ ) -> Tuple [int , int , int , List [ Evaluation ] ]:
11521164 """Process a single item: map, evaluate, create scores.
11531165
11541166 Args:
@@ -1161,7 +1173,7 @@ async def _process_batch_evaluation_item(
11611173 evaluator_stats_dict: Dictionary tracking evaluator statistics.
11621174
11631175 Returns:
1164- Tuple of (scores_created, composite_scores_created, evaluations_failed).
1176+ Tuple of (scores_created, composite_scores_created, evaluations_failed, evaluations), where evaluations is the list of regular and composite evaluations for the item.
11651177
11661178 Raises:
11671179 Exception: If mapping fails or item processing encounters fatal error.
@@ -1235,10 +1247,18 @@ async def _process_batch_evaluation_item(
12351247 )
12361248 composite_scores_created += 1
12371249
1250+ # Add composite evaluations to the list
1251+ evaluations .extend (composite_evals )
1252+
12381253 except Exception as e :
12391254 self ._log .warning (f"Composite evaluator failed on item { item_id } : { e } " )
12401255
1241- return (scores_created , composite_scores_created , evaluations_failed )
1256+ return (
1257+ scores_created ,
1258+ composite_scores_created ,
1259+ evaluations_failed ,
1260+ evaluations ,
1261+ )
12421262
12431263 async def _run_evaluator_internal (
12441264 self ,
@@ -1495,6 +1515,7 @@ def _build_result(
14951515 failed_item_ids : List [str ],
14961516 error_summary : Dict [str , int ],
14971517 has_more_items : bool ,
1518+ item_evaluations : Dict [str , List [Evaluation ]],
14981519 ) -> BatchEvaluationResult :
14991520 """Build the final BatchEvaluationResult.
15001521
@@ -1512,6 +1533,7 @@ def _build_result(
15121533 failed_item_ids: IDs of failed items.
15131534 error_summary: Error type counts.
15141535 has_more_items: Whether more items exist.
1536+ item_evaluations: Dictionary mapping item IDs to their evaluation results.
15151537
15161538 Returns:
15171539 BatchEvaluationResult instance.
@@ -1532,4 +1554,5 @@ def _build_result(
15321554 failed_item_ids = failed_item_ids ,
15331555 error_summary = error_summary ,
15341556 has_more_items = has_more_items ,
1557+ item_evaluations = item_evaluations ,
15351558 )
0 commit comments