Commit 06a6e37

add item evaluations
1 parent 66288a6 commit 06a6e37

2 files changed: 140 additions & 5 deletions

langfuse/batch_evaluation.py

Lines changed: 28 additions & 5 deletions

@@ -594,6 +594,7 @@ class BatchEvaluationResult:
         failed_item_ids: List of IDs for items that failed evaluation.
         error_summary: Dictionary mapping error types to occurrence counts.
         has_more_items: True if max_items limit was reached but more items exist.
+        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
 
     Examples:
         Basic result inspection:
@@ -690,6 +691,7 @@ def __init__(
         failed_item_ids: List[str],
         error_summary: Dict[str, int],
         has_more_items: bool,
+        item_evaluations: Dict[str, List["Evaluation"]],
     ):
         """Initialize BatchEvaluationResult with comprehensive statistics.
 
@@ -707,6 +709,7 @@ def __init__(
             failed_item_ids: IDs of failed items.
             error_summary: Error types and counts.
             has_more_items: Whether more items exist beyond max_items.
+            item_evaluations: Dictionary mapping item IDs to their evaluation results.
 
         Note:
             All arguments must be provided as keywords.
@@ -724,6 +727,7 @@ def __init__(
         self.failed_item_ids = failed_item_ids
         self.error_summary = error_summary
         self.has_more_items = has_more_items
+        self.item_evaluations = item_evaluations
 
     def __str__(self) -> str:
         """Return a formatted string representation of the batch evaluation results.
@@ -884,6 +888,7 @@ async def run_async(
         total_evaluations_failed = 0
         failed_item_ids: List[str] = []
         error_summary: Dict[str, int] = {}
+        item_evaluations: Dict[str, List[Evaluation]] = {}
 
         # Initialize evaluator stats
         evaluator_stats_dict = {
@@ -958,6 +963,7 @@ async def run_async(
                 failed_item_ids=failed_item_ids,
                 error_summary=error_summary,
                 has_more_items=has_more,
+                item_evaluations=item_evaluations,
             )
 
         # Check if we got any items
@@ -987,7 +993,7 @@ async def run_async(
         # Process items concurrently
         async def process_item(
             item: Union[TraceWithFullDetails, ObservationsView],
-        ) -> Tuple[str, Union[Tuple[int, int, int], Exception]]:
+        ) -> Tuple[str, Union[Tuple[int, int, int, List[Evaluation]], Exception]]:
             """Process a single item and return (item_id, result)."""
             async with semaphore:
                 item_id = self._get_item_id(item, scope)
@@ -1021,11 +1027,16 @@ async def process_item(
                 else:
                     # Item processed successfully
                     total_items_processed += 1
-                    scores_created, composite_created, evals_failed = result
+                    scores_created, composite_created, evals_failed, evaluations = (
+                        result
+                    )
                     total_scores_created += scores_created
                     total_composite_scores_created += composite_created
                     total_evaluations_failed += evals_failed
 
+                    # Store evaluations for this item
+                    item_evaluations[item_id] = evaluations
+
                     # Update last processed tracking
                     last_item_timestamp = self._get_item_timestamp(item, scope)
                     last_item_id = item_id
@@ -1092,6 +1103,7 @@ async def process_item(
             has_more_items=(
                 has_more and max_items is not None and total_items_fetched >= max_items
             ),
+            item_evaluations=item_evaluations,
        )
 
    async def _fetch_batch_with_retry(
@@ -1148,7 +1160,7 @@ async def _process_batch_evaluation_item(
         composite_evaluator: Optional[CompositeEvaluatorFunction],
         metadata: Optional[Dict[str, Any]],
         evaluator_stats_dict: Dict[str, EvaluatorStats],
-    ) -> Tuple[int, int, int]:
+    ) -> Tuple[int, int, int, List[Evaluation]]:
         """Process a single item: map, evaluate, create scores.
 
         Args:
@@ -1161,7 +1173,7 @@ async def _process_batch_evaluation_item(
             evaluator_stats_dict: Dictionary tracking evaluator statistics.
 
         Returns:
-            Tuple of (scores_created, composite_scores_created, evaluations_failed).
+            Tuple of (scores_created, composite_scores_created, evaluations_failed, all_evaluations).
 
         Raises:
             Exception: If mapping fails or item processing encounters fatal error.
@@ -1235,10 +1247,18 @@ async def _process_batch_evaluation_item(
                 )
                 composite_scores_created += 1
 
+                # Add composite evaluations to the list
+                evaluations.extend(composite_evals)
+
             except Exception as e:
                 self._log.warning(f"Composite evaluator failed on item {item_id}: {e}")
 
-        return (scores_created, composite_scores_created, evaluations_failed)
+        return (
+            scores_created,
+            composite_scores_created,
+            evaluations_failed,
+            evaluations,
+        )
 
     async def _run_evaluator_internal(
         self,
@@ -1495,6 +1515,7 @@ def _build_result(
         failed_item_ids: List[str],
         error_summary: Dict[str, int],
         has_more_items: bool,
+        item_evaluations: Dict[str, List[Evaluation]],
     ) -> BatchEvaluationResult:
         """Build the final BatchEvaluationResult.
 
@@ -1512,6 +1533,7 @@ def _build_result(
             failed_item_ids: IDs of failed items.
             error_summary: Error type counts.
             has_more_items: Whether more items exist.
+            item_evaluations: Dictionary mapping item IDs to their evaluation results.
 
         Returns:
             BatchEvaluationResult instance.
@@ -1532,4 +1554,5 @@ def _build_result(
             failed_item_ids=failed_item_ids,
             error_summary=error_summary,
             has_more_items=has_more_items,
+            item_evaluations=item_evaluations,
         )
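
Taken together, these changes expose per-item results on the returned BatchEvaluationResult. A minimal consumption sketch, assuming the run_batched_evaluation entry point used in the tests below and the Evaluation(name=..., value=...) shape; the Evaluation import path is guessed from this commit's file layout, and the mapper's return contract is not shown in the diff, so it is illustrative only:

from langfuse import Langfuse
from langfuse.batch_evaluation import Evaluation  # import path is an assumption

def length_evaluator(*, input, output, **kwargs):
    # Illustrative evaluator: scores each mapped item by output length.
    return Evaluation(name="output_length", value=len(str(output or "")))

def trace_mapper(*, item):
    # Illustrative mapper; the tests only reveal the (*, item) signature,
    # so this return shape is a guess.
    return {"input": item.input, "output": item.output}

langfuse = Langfuse()

result = langfuse.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    max_items=10,
)

# New in this commit: item_evaluations maps each successfully processed
# item ID to the list of Evaluation objects produced for it, including
# composite evaluations when a composite_evaluator is configured.
for item_id, evaluations in result.item_evaluations.items():
    for e in evaluations:
        print(f"{item_id}: {e.name}={e.value}")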

tests/test_batch_evaluation.py

Lines changed: 112 additions & 0 deletions

@@ -207,13 +207,15 @@ def test_result_structure_fields(sample_traces, langfuse_client):
     assert hasattr(result, "failed_item_ids")
     assert hasattr(result, "error_summary")
     assert hasattr(result, "has_more_items")
+    assert hasattr(result, "item_evaluations")
 
     # Check types
     assert isinstance(result.evaluator_stats, list)
     assert isinstance(result.failed_item_ids, list)
     assert isinstance(result.error_summary, dict)
     assert isinstance(result.completed, bool)
     assert isinstance(result.has_more_items, bool)
+    assert isinstance(result.item_evaluations, dict)
 
 
 # ============================================================================
@@ -988,3 +990,113 @@ def test_verbose_logging(sample_traces, langfuse_client):
     )
 
     assert result.completed is True
+
+
+# ============================================================================
+# ITEM EVALUATIONS TESTS
+# ============================================================================
+
+
+def test_item_evaluations_basic(sample_traces, langfuse_client):
+    """Test that item_evaluations dict contains correct structure."""
+
+    def test_evaluator(*, input, output, **kwargs):
+        return Evaluation(name="test_metric", value=0.5)
+
+    result = langfuse_client.run_batched_evaluation(
+        scope="traces",
+        mapper=simple_trace_mapper,
+        evaluators=[test_evaluator],
+        max_items=3,
+    )
+
+    # Check that item_evaluations is a dict
+    assert isinstance(result.item_evaluations, dict)
+
+    # Should have evaluations for each processed item
+    assert len(result.item_evaluations) == result.total_items_processed
+
+    # Each entry should be a list of Evaluation objects
+    for item_id, evaluations in result.item_evaluations.items():
+        assert isinstance(item_id, str)
+        assert isinstance(evaluations, list)
+        assert all(isinstance(e, Evaluation) for e in evaluations)
+        # Should have one evaluation per evaluator
+        assert len(evaluations) == 1
+        assert evaluations[0].name == "test_metric"
+
+
+def test_item_evaluations_multiple_evaluators(sample_traces, langfuse_client):
+    """Test item_evaluations with multiple evaluators."""
+
+    def accuracy_evaluator(*, input, output, **kwargs):
+        return Evaluation(name="accuracy", value=0.8)
+
+    def relevance_evaluator(*, input, output, **kwargs):
+        return Evaluation(name="relevance", value=0.9)
+
+    result = langfuse_client.run_batched_evaluation(
+        scope="traces",
+        mapper=simple_trace_mapper,
+        evaluators=[accuracy_evaluator, relevance_evaluator],
+        max_items=2,
+    )
+
+    # Check structure
+    assert len(result.item_evaluations) == result.total_items_processed
+
+    # Each item should have evaluations from both evaluators
+    for item_id, evaluations in result.item_evaluations.items():
+        assert len(evaluations) == 2
+        eval_names = {e.name for e in evaluations}
+        assert eval_names == {"accuracy", "relevance"}
+
+
+def test_item_evaluations_with_composite(sample_traces, langfuse_client):
+    """Test that item_evaluations includes composite evaluations."""
+
+    def base_evaluator(*, input, output, **kwargs):
+        return Evaluation(name="base_score", value=0.7)
+
+    def composite_evaluator(*, input, output, expected_output, metadata, evaluations):
+        return Evaluation(
+            name="composite_score",
+            value=sum(
+                e.value for e in evaluations if isinstance(e.value, (int, float))
+            ),
+        )
+
+    result = langfuse_client.run_batched_evaluation(
+        scope="traces",
+        mapper=simple_trace_mapper,
+        evaluators=[base_evaluator],
+        composite_evaluator=composite_evaluator,
+        max_items=2,
+    )
+
+    # Each item should have both base and composite evaluations
+    for item_id, evaluations in result.item_evaluations.items():
+        assert len(evaluations) == 2
+        eval_names = {e.name for e in evaluations}
+        assert eval_names == {"base_score", "composite_score"}
+
+    # Verify composite scores were created
+    assert result.total_composite_scores_created > 0
+
+
+def test_item_evaluations_empty_on_failure(sample_traces, langfuse_client):
+    """Test that failed items don't appear in item_evaluations."""
+
+    def failing_mapper(*, item):
+        raise Exception("Mapper failed")
+
+    result = langfuse_client.run_batched_evaluation(
+        scope="traces",
+        mapper=failing_mapper,
+        evaluators=[simple_evaluator],
+        max_items=3,
+    )
+
+    # All items failed, so item_evaluations should be empty
+    assert len(result.item_evaluations) == 0
+    assert result.total_items_failed > 0