Commit 06a6e37

add item evaluations
1 parent 66288a6 commit 06a6e37

2 files changed: 140 additions & 5 deletions

langfuse/batch_evaluation.py

Lines changed: 28 additions & 5 deletions

@@ -594,6 +594,7 @@ class BatchEvaluationResult:
         failed_item_ids: List of IDs for items that failed evaluation.
         error_summary: Dictionary mapping error types to occurrence counts.
         has_more_items: True if max_items limit was reached but more items exist.
+        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
 
     Examples:
         Basic result inspection:
@@ -690,6 +691,7 @@ def __init__(
         failed_item_ids: List[str],
         error_summary: Dict[str, int],
         has_more_items: bool,
+        item_evaluations: Dict[str, List["Evaluation"]],
     ):
         """Initialize BatchEvaluationResult with comprehensive statistics.
 
@@ -707,6 +709,7 @@ def __init__(
             failed_item_ids: IDs of failed items.
             error_summary: Error types and counts.
             has_more_items: Whether more items exist beyond max_items.
+            item_evaluations: Dictionary mapping item IDs to their evaluation results.
 
         Note:
             All arguments must be provided as keywords.
@@ -724,6 +727,7 @@ def __init__(
         self.failed_item_ids = failed_item_ids
         self.error_summary = error_summary
         self.has_more_items = has_more_items
+        self.item_evaluations = item_evaluations
 
     def __str__(self) -> str:
         """Return a formatted string representation of the batch evaluation results.
@@ -884,6 +888,7 @@ async def run_async(
         total_evaluations_failed = 0
         failed_item_ids: List[str] = []
         error_summary: Dict[str, int] = {}
+        item_evaluations: Dict[str, List[Evaluation]] = {}
 
         # Initialize evaluator stats
         evaluator_stats_dict = {
@@ -958,6 +963,7 @@ async def run_async(
                 failed_item_ids=failed_item_ids,
                 error_summary=error_summary,
                 has_more_items=has_more,
+                item_evaluations=item_evaluations,
             )
 
         # Check if we got any items
@@ -987,7 +993,7 @@ async def run_async(
         # Process items concurrently
         async def process_item(
             item: Union[TraceWithFullDetails, ObservationsView],
-        ) -> Tuple[str, Union[Tuple[int, int, int], Exception]]:
+        ) -> Tuple[str, Union[Tuple[int, int, int, List[Evaluation]], Exception]]:
             """Process a single item and return (item_id, result)."""
             async with semaphore:
                 item_id = self._get_item_id(item, scope)
@@ -1021,11 +1027,16 @@ async def process_item(
                 else:
                     # Item processed successfully
                     total_items_processed += 1
-                    scores_created, composite_created, evals_failed = result
+                    scores_created, composite_created, evals_failed, evaluations = (
+                        result
+                    )
                     total_scores_created += scores_created
                     total_composite_scores_created += composite_created
                     total_evaluations_failed += evals_failed
 
+                    # Store evaluations for this item
+                    item_evaluations[item_id] = evaluations
+
                     # Update last processed tracking
                     last_item_timestamp = self._get_item_timestamp(item, scope)
                     last_item_id = item_id
@@ -1092,6 +1103,7 @@ async def process_item(
             has_more_items=(
                 has_more and max_items is not None and total_items_fetched >= max_items
             ),
+            item_evaluations=item_evaluations,
        )
 
    async def _fetch_batch_with_retry(
@@ -1148,7 +1160,7 @@ async def _process_batch_evaluation_item(
         composite_evaluator: Optional[CompositeEvaluatorFunction],
         metadata: Optional[Dict[str, Any]],
         evaluator_stats_dict: Dict[str, EvaluatorStats],
-    ) -> Tuple[int, int, int]:
+    ) -> Tuple[int, int, int, List[Evaluation]]:
         """Process a single item: map, evaluate, create scores.
 
         Args:
@@ -1161,7 +1173,7 @@ async def _process_batch_evaluation_item(
             evaluator_stats_dict: Dictionary tracking evaluator statistics.
 
         Returns:
-            Tuple of (scores_created, composite_scores_created, evaluations_failed).
+            Tuple of (scores_created, composite_scores_created, evaluations_failed, all_evaluations).
 
         Raises:
             Exception: If mapping fails or item processing encounters fatal error.
@@ -1235,10 +1247,18 @@ async def _process_batch_evaluation_item(
                 )
                 composite_scores_created += 1
 
+                # Add composite evaluations to the list
+                evaluations.extend(composite_evals)
+
             except Exception as e:
                 self._log.warning(f"Composite evaluator failed on item {item_id}: {e}")
 
-        return (scores_created, composite_scores_created, evaluations_failed)
+        return (
+            scores_created,
+            composite_scores_created,
+            evaluations_failed,
+            evaluations,
+        )
 
     async def _run_evaluator_internal(
         self,
@@ -1495,6 +1515,7 @@ def _build_result(
         failed_item_ids: List[str],
         error_summary: Dict[str, int],
         has_more_items: bool,
+        item_evaluations: Dict[str, List[Evaluation]],
     ) -> BatchEvaluationResult:
         """Build the final BatchEvaluationResult.
 
@@ -1512,6 +1533,7 @@ def _build_result(
             failed_item_ids: IDs of failed items.
             error_summary: Error type counts.
             has_more_items: Whether more items exist.
+            item_evaluations: Dictionary mapping item IDs to their evaluation results.
 
         Returns:
             BatchEvaluationResult instance.
@@ -1532,4 +1554,5 @@ def _build_result(
             failed_item_ids=failed_item_ids,
             error_summary=error_summary,
             has_more_items=has_more_items,
+            item_evaluations=item_evaluations,
         )
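
Taken together, these changes expose per-item results on the returned BatchEvaluationResult. A minimal consumption sketch, assuming the run_batched_evaluation entry point used in the tests below and the Evaluation(name=..., value=...) shape; the Evaluation import path is guessed from this commit's file layout, and the mapper's return contract is not shown in the diff, so it is illustrative only:

from langfuse import Langfuse
from langfuse.batch_evaluation import Evaluation  # import path is an assumption

def length_evaluator(*, input, output, **kwargs):
    # Illustrative evaluator: scores each mapped item by output length.
    return Evaluation(name="output_length", value=len(str(output or "")))

def trace_mapper(*, item):
    # Illustrative mapper; the tests only reveal the (*, item) signature,
    # so this return shape is a guess.
    return {"input": item.input, "output": item.output}

langfuse = Langfuse()

result = langfuse.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    max_items=10,
)

# New in this commit: item_evaluations maps each successfully processed
# item ID to the list of Evaluation objects produced for it, including
# composite evaluations when a composite_evaluator is configured.
for item_id, evaluations in result.item_evaluations.items():
    for e in evaluations:
        print(f"{item_id}: {e.name}={e.value}")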

tests/test_batch_evaluation.py

Lines changed: 112 additions & 0 deletions

@@ -207,13 +207,15 @@ def test_result_structure_fields(sample_traces, langfuse_client):
     assert hasattr(result, "failed_item_ids")
     assert hasattr(result, "error_summary")
     assert hasattr(result, "has_more_items")
+    assert hasattr(result, "item_evaluations")
 
     # Check types
     assert isinstance(result.evaluator_stats, list)
     assert isinstance(result.failed_item_ids, list)
     assert isinstance(result.error_summary, dict)
     assert isinstance(result.completed, bool)
     assert isinstance(result.has_more_items, bool)
+    assert isinstance(result.item_evaluations, dict)
 
 
 # ============================================================================
@@ -988,3 +990,113 @@ def test_verbose_logging(sample_traces, langfuse_client):
     )
 
     assert result.completed is True
+
+
+# ============================================================================
+# ITEM EVALUATIONS TESTS
+# ============================================================================
+
+
+def test_item_evaluations_basic(sample_traces, langfuse_client):
+    """Test that item_evaluations dict contains correct structure."""
+
+    def test_evaluator(*, input, output, **kwargs):
+        return Evaluation(name="test_metric", value=0.5)
+
+    result = langfuse_client.run_batched_evaluation(
+        scope="traces",
+        mapper=simple_trace_mapper,
+        evaluators=[test_evaluator],
+        max_items=3,
+    )
+
+    # Check that item_evaluations is a dict
+    assert isinstance(result.item_evaluations, dict)
+
+    # Should have evaluations for each processed item
+    assert len(result.item_evaluations) == result.total_items_processed
+
+    # Each entry should be a list of Evaluation objects
+    for item_id, evaluations in result.item_evaluations.items():
+        assert isinstance(item_id, str)
+        assert isinstance(evaluations, list)
+        assert all(isinstance(e, Evaluation) for e in evaluations)
+        # Should have one evaluation per evaluator
+        assert len(evaluations) == 1
+        assert evaluations[0].name == "test_metric"
+
+
+def test_item_evaluations_multiple_evaluators(sample_traces, langfuse_client):
+    """Test item_evaluations with multiple evaluators."""
+
+    def accuracy_evaluator(*, input, output, **kwargs):
+        return Evaluation(name="accuracy", value=0.8)
+
+    def relevance_evaluator(*, input, output, **kwargs):
+        return Evaluation(name="relevance", value=0.9)
+
+    result = langfuse_client.run_batched_evaluation(
+        scope="traces",
+        mapper=simple_trace_mapper,
+        evaluators=[accuracy_evaluator, relevance_evaluator],
+        max_items=2,
+    )
+
+    # Check structure
+    assert len(result.item_evaluations) == result.total_items_processed
+
+    # Each item should have evaluations from both evaluators
+    for item_id, evaluations in result.item_evaluations.items():
+        assert len(evaluations) == 2
+        eval_names = {e.name for e in evaluations}
+        assert eval_names == {"accuracy", "relevance"}
+
+
+def test_item_evaluations_with_composite(sample_traces, langfuse_client):
+    """Test that item_evaluations includes composite evaluations."""
+
+    def base_evaluator(*, input, output, **kwargs):
+        return Evaluation(name="base_score", value=0.7)
+
+    def composite_evaluator(*, input, output, expected_output, metadata, evaluations):
+        return Evaluation(
+            name="composite_score",
+            value=sum(
+                e.value for e in evaluations if isinstance(e.value, (int, float))
+            ),
+        )
+
+    result = langfuse_client.run_batched_evaluation(
+        scope="traces",
+        mapper=simple_trace_mapper,
+        evaluators=[base_evaluator],
+        composite_evaluator=composite_evaluator,
+        max_items=2,
+    )
+
+    # Each item should have both base and composite evaluations
+    for item_id, evaluations in result.item_evaluations.items():
+        assert len(evaluations) == 2
+        eval_names = {e.name for e in evaluations}
+        assert eval_names == {"base_score", "composite_score"}
+
+    # Verify composite scores were created
+    assert result.total_composite_scores_created > 0
+
+
+def test_item_evaluations_empty_on_failure(sample_traces, langfuse_client):
+    """Test that failed items don't appear in item_evaluations."""
+
+    def failing_mapper(*, item):
+        raise Exception("Mapper failed")
+
+    result = langfuse_client.run_batched_evaluation(
+        scope="traces",
+        mapper=failing_mapper,
+        evaluators=[simple_evaluator],
+        max_items=3,
+    )
+
+    # All items failed, so item_evaluations should be empty
+    assert len(result.item_evaluations) == 0
+    assert result.total_items_failed > 0