@@ -31,34 +31,6 @@ def langfuse_client():
3131 return get_client ()
3232
3333
34- @pytest .fixture
35- def sample_traces (langfuse_client ):
36- """Create sample traces in Langfuse for testing.
37-
38- Returns:
39- List of trace IDs created
40- """
41- trace_ids = []
42- test_session_id = f"test-session-{ create_uuid ()} "
43-
44- for i in range (5 ):
45- trace_name = f"batch-eval-test-{ create_uuid ()} "
46- with langfuse_client .start_as_current_span (name = trace_name ) as span :
47- span .update_trace (
48- input = f"Test input { i } " ,
49- output = f"Test output { i } " ,
50- metadata = {"test_index" : i , "test_type" : "batch_eval" },
51- session_id = test_session_id ,
52- tags = ["test" , "batch_evaluation" ],
53- )
54- trace_ids .append (langfuse_client .get_current_trace_id ())
55-
56- langfuse_client .flush ()
57- time .sleep (3 ) # Wait for API processing
58-
59- return trace_ids
60-
61-
6234@pytest .fixture
6335def sample_trace_name ():
6436 """Generate a unique trace name for filtering."""
@@ -92,7 +64,7 @@ def simple_evaluator(*, input, output, expected_output=None, metadata=None, **kw
9264# ============================================================================
9365
9466
95- def test_run_batched_evaluation_on_traces_basic (sample_traces , langfuse_client ):
67+ def test_run_batched_evaluation_on_traces_basic (langfuse_client ):
9668 """Test basic batch evaluation on traces."""
9769 result = langfuse_client .run_batched_evaluation (
9870 scope = "traces" ,
@@ -118,7 +90,7 @@ def test_run_batched_evaluation_on_traces_basic(sample_traces, langfuse_client):
11890 assert stats .name == "simple_evaluator"
11991
12092
121- def test_batch_evaluation_with_filter (sample_traces , langfuse_client ):
93+ def test_batch_evaluation_with_filter (langfuse_client ):
12294 """Test batch evaluation with JSON filter."""
12395 # Create a trace with specific tag
12496 unique_tag = f"test-filter-{ create_uuid ()} "
@@ -150,7 +122,7 @@ def test_batch_evaluation_with_filter(sample_traces, langfuse_client):
150122 assert result .completed is True
151123
152124
153- def test_batch_evaluation_with_metadata (sample_traces , langfuse_client ):
125+ def test_batch_evaluation_with_metadata (langfuse_client ):
154126 """Test that additional metadata is added to all scores."""
155127
156128 def metadata_checking_evaluator (* , input , output , metadata = None , ** kwargs ):
@@ -184,7 +156,7 @@ def metadata_checking_evaluator(*, input, output, metadata=None, **kwargs):
184156 assert result .completed is True
185157
186158
187- def test_result_structure_fields (sample_traces , langfuse_client ):
159+ def test_result_structure_fields (langfuse_client ):
188160 """Test that BatchEvaluationResult has all expected fields."""
189161 result = langfuse_client .run_batched_evaluation (
190162 scope = "traces" ,
@@ -223,7 +195,7 @@ def test_result_structure_fields(sample_traces, langfuse_client):
223195# ============================================================================
224196
225197
226- def test_simple_mapper (sample_traces , langfuse_client ):
198+ def test_simple_mapper (langfuse_client ):
227199 """Test basic mapper functionality."""
228200
229201 def custom_mapper (* , item ):
@@ -245,7 +217,7 @@ def custom_mapper(*, item):
245217
246218
247219@pytest .mark .asyncio
248- async def test_async_mapper (sample_traces , langfuse_client ):
220+ async def test_async_mapper (langfuse_client ):
249221 """Test that async mappers work correctly."""
250222
251223 async def async_mapper (* , item ):
@@ -268,7 +240,7 @@ async def async_mapper(*, item):
268240 assert result .total_items_processed > 0
269241
270242
271- def test_mapper_failure_handling (sample_traces , langfuse_client ):
243+ def test_mapper_failure_handling (langfuse_client ):
272244 """Test that mapper failures cause items to be skipped."""
273245
274246 def failing_mapper (* , item ):
@@ -287,7 +259,7 @@ def failing_mapper(*, item):
287259 assert "ValueError" in result .error_summary or "Exception" in result .error_summary
288260
289261
290- def test_mapper_with_missing_fields (sample_traces , langfuse_client ):
262+ def test_mapper_with_missing_fields (langfuse_client ):
291263 """Test mapper handles traces with missing fields gracefully."""
292264
293265 def robust_mapper (* , item ):
@@ -317,7 +289,7 @@ def robust_mapper(*, item):
317289# ============================================================================
318290
319291
320- def test_single_evaluator (sample_traces , langfuse_client ):
292+ def test_single_evaluator (langfuse_client ):
321293 """Test with a single evaluator."""
322294
323295 def quality_evaluator (* , input , output , ** kwargs ):
@@ -335,7 +307,7 @@ def quality_evaluator(*, input, output, **kwargs):
335307 assert result .evaluator_stats [0 ].name == "quality_evaluator"
336308
337309
338- def test_multiple_evaluators (sample_traces , langfuse_client ):
310+ def test_multiple_evaluators (langfuse_client ):
339311 """Test with multiple evaluators running in parallel."""
340312
341313 def accuracy_evaluator (* , input , output , ** kwargs ):
@@ -360,7 +332,7 @@ def safety_evaluator(*, input, output, **kwargs):
360332
361333
362334@pytest .mark .asyncio
363- async def test_async_evaluator (sample_traces , langfuse_client ):
335+ async def test_async_evaluator (langfuse_client ):
364336 """Test that async evaluators work correctly."""
365337
366338 async def async_evaluator (* , input , output , ** kwargs ):
@@ -377,7 +349,7 @@ async def async_evaluator(*, input, output, **kwargs):
377349 assert result .total_scores_created > 0
378350
379351
380- def test_evaluator_returning_list (sample_traces , langfuse_client ):
352+ def test_evaluator_returning_list (langfuse_client ):
381353 """Test evaluator that returns multiple Evaluations."""
382354
383355 def multi_score_evaluator (* , input , output , ** kwargs ):
@@ -398,7 +370,7 @@ def multi_score_evaluator(*, input, output, **kwargs):
398370 assert result .total_scores_created >= result .total_items_processed * 3
399371
400372
401- def test_evaluator_failure_statistics (sample_traces , langfuse_client ):
373+ def test_evaluator_failure_statistics (langfuse_client ):
402374 """Test that evaluator failures are tracked in statistics."""
403375
404376 def working_evaluator (* , input , output , ** kwargs ):
@@ -433,7 +405,7 @@ def failing_evaluator(*, input, output, **kwargs):
433405 assert result .total_evaluations_failed > 0
434406
435407
436- def test_mixed_sync_async_evaluators (sample_traces , langfuse_client ):
408+ def test_mixed_sync_async_evaluators (langfuse_client ):
437409 """Test mixing synchronous and asynchronous evaluators."""
438410
439411 def sync_evaluator (* , input , output , ** kwargs ):
@@ -459,7 +431,7 @@ async def async_evaluator(*, input, output, **kwargs):
459431# ============================================================================
460432
461433
462- def test_composite_evaluator_weighted_average (sample_traces , langfuse_client ):
434+ def test_composite_evaluator_weighted_average (langfuse_client ):
463435 """Test composite evaluator that computes weighted average."""
464436
465437 def accuracy_evaluator (* , input , output , ** kwargs ):
@@ -496,7 +468,7 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
496468 assert result .total_scores_created > result .total_composite_scores_created
497469
498470
499- def test_composite_evaluator_pass_fail (sample_traces , langfuse_client ):
471+ def test_composite_evaluator_pass_fail (langfuse_client ):
500472 """Test composite evaluator that implements pass/fail logic."""
501473
502474 def metric1_evaluator (* , input , output , ** kwargs ):
@@ -532,7 +504,7 @@ def pass_fail_composite(*, input, output, expected_output, metadata, evaluations
532504
533505
534506@pytest .mark .asyncio
535- async def test_async_composite_evaluator (sample_traces , langfuse_client ):
507+ async def test_async_composite_evaluator (langfuse_client ):
536508 """Test async composite evaluator."""
537509
538510 def evaluator1 (* , input , output , ** kwargs ):
@@ -556,7 +528,7 @@ async def async_composite(*, input, output, expected_output, metadata, evaluatio
556528 assert result .total_composite_scores_created > 0
557529
558530
559- def test_composite_evaluator_with_no_evaluations (sample_traces , langfuse_client ):
531+ def test_composite_evaluator_with_no_evaluations (langfuse_client ):
560532 """Test composite evaluator when no evaluations are present."""
561533
562534 def always_failing_evaluator (* , input , output , ** kwargs ):
@@ -578,7 +550,7 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
578550 assert result .total_composite_scores_created == 0
579551
580552
581- def test_composite_evaluator_failure_handling (sample_traces , langfuse_client ):
553+ def test_composite_evaluator_failure_handling (langfuse_client ):
582554 """Test that composite evaluator failures are handled gracefully."""
583555
584556 def evaluator1 (* , input , output , ** kwargs ):
@@ -606,7 +578,7 @@ def failing_composite(*, input, output, expected_output, metadata, evaluations):
606578# ============================================================================
607579
608580
609- def test_mapper_failure_skips_item (sample_traces , langfuse_client ):
581+ def test_mapper_failure_skips_item (langfuse_client ):
610582 """Test that mapper failure causes item to be skipped."""
611583
612584 call_count = {"count" : 0 }
@@ -629,7 +601,7 @@ def sometimes_failing_mapper(*, item):
629601 assert result .total_items_processed > 0
630602
631603
632- def test_evaluator_failure_continues (sample_traces , langfuse_client ):
604+ def test_evaluator_failure_continues (langfuse_client ):
633605 """Test that one evaluator failing doesn't stop others."""
634606
635607 def working_evaluator1 (* , input , output , ** kwargs ):
@@ -658,7 +630,7 @@ def working_evaluator2(*, input, output, **kwargs):
658630 assert failing_stats .failed_runs > 0
659631
660632
661- def test_all_evaluators_fail (sample_traces , langfuse_client ):
633+ def test_all_evaluators_fail (langfuse_client ):
662634 """Test when all evaluators fail but item is still processed."""
663635
664636 def failing_evaluator1 (* , input , output , ** kwargs ):
@@ -706,7 +678,7 @@ def test_empty_results_handling(langfuse_client):
706678 assert result .has_more_items is False
707679
708680
709- def test_max_items_zero (sample_traces , langfuse_client ):
681+ def test_max_items_zero (langfuse_client ):
710682 """Test with max_items=0 (should process no items)."""
711683 result = langfuse_client .run_batched_evaluation (
712684 scope = "traces" ,
@@ -719,7 +691,7 @@ def test_max_items_zero(sample_traces, langfuse_client):
719691 assert result .total_items_processed == 0
720692
721693
722- def test_evaluation_value_type_conversions (sample_traces , langfuse_client ):
694+ def test_evaluation_value_type_conversions (langfuse_client ):
723695 """Test that different evaluation value types are handled correctly."""
724696
725697 def multi_type_evaluator (* , input , output , ** kwargs ):
@@ -746,7 +718,7 @@ def multi_type_evaluator(*, input, output, **kwargs):
746718# ============================================================================
747719
748720
749- def test_pagination_with_max_items (sample_traces , langfuse_client ):
721+ def test_pagination_with_max_items (langfuse_client ):
750722 """Test that max_items limit is respected."""
751723 # Create more traces to ensure we have enough data
752724 for i in range (10 ):
@@ -777,7 +749,7 @@ def test_pagination_with_max_items(sample_traces, langfuse_client):
777749 assert result .total_items_processed <= 5
778750
779751
780- def test_has_more_items_flag (sample_traces , langfuse_client ):
752+ def test_has_more_items_flag (langfuse_client ):
781753 """Test that has_more_items flag is set correctly when max_items is reached."""
782754 # Create enough traces to exceed max_items
783755 batch_tag = f"batch-test-{ create_uuid ()} "
@@ -808,7 +780,7 @@ def test_has_more_items_flag(sample_traces, langfuse_client):
808780 assert result .has_more_items is True
809781
810782
811- def test_fetch_batch_size_parameter (sample_traces , langfuse_client ):
783+ def test_fetch_batch_size_parameter (langfuse_client ):
812784 """Test that different fetch_batch_size values work correctly."""
813785 for batch_size in [1 , 5 , 10 ]:
814786 result = langfuse_client .run_batched_evaluation (
@@ -850,7 +822,7 @@ def test_resume_token_structure(langfuse_client):
850822# ============================================================================
851823
852824
853- def test_max_concurrency_parameter (sample_traces , langfuse_client ):
825+ def test_max_concurrency_parameter (langfuse_client ):
854826 """Test that max_concurrency parameter works correctly."""
855827 for concurrency in [1 , 5 , 10 ]:
856828 result = langfuse_client .run_batched_evaluation (
@@ -870,7 +842,7 @@ def test_max_concurrency_parameter(sample_traces, langfuse_client):
870842# ============================================================================
871843
872844
873- def test_evaluator_stats_structure (sample_traces , langfuse_client ):
845+ def test_evaluator_stats_structure (langfuse_client ):
874846 """Test that EvaluatorStats has correct structure."""
875847
876848 def test_evaluator (* , input , output , ** kwargs ):
@@ -900,7 +872,7 @@ def test_evaluator(*, input, output, **kwargs):
900872 assert stats .failed_runs == 0
901873
902874
903- def test_evaluator_stats_tracking (sample_traces , langfuse_client ):
875+ def test_evaluator_stats_tracking (langfuse_client ):
904876 """Test that evaluator statistics are tracked correctly."""
905877
906878 call_count = {"count" : 0 }
@@ -925,7 +897,7 @@ def sometimes_failing_evaluator(*, input, output, **kwargs):
925897 assert stats .successful_runs + stats .failed_runs == stats .total_runs
926898
927899
928- def test_error_summary_aggregation (sample_traces , langfuse_client ):
900+ def test_error_summary_aggregation (langfuse_client ):
929901 """Test that error types are aggregated correctly in error_summary."""
930902
931903 def failing_mapper (* , item ):
@@ -943,7 +915,7 @@ def failing_mapper(*, item):
943915 assert any ("Error" in key for key in result .error_summary .keys ())
944916
945917
946- def test_failed_item_ids_collected (sample_traces , langfuse_client ):
918+ def test_failed_item_ids_collected (langfuse_client ):
947919 """Test that failed item IDs are collected."""
948920
949921 def failing_mapper (* , item ):
@@ -966,7 +938,7 @@ def failing_mapper(*, item):
966938# ============================================================================
967939
968940
969- def test_duration_tracking (sample_traces , langfuse_client ):
941+ def test_duration_tracking (langfuse_client ):
970942 """Test that duration is tracked correctly."""
971943 result = langfuse_client .run_batched_evaluation (
972944 scope = "traces" ,
@@ -979,7 +951,7 @@ def test_duration_tracking(sample_traces, langfuse_client):
979951 assert result .duration_seconds < 60 # Should complete quickly for small batch
980952
981953
982- def test_verbose_logging (sample_traces , langfuse_client ):
954+ def test_verbose_logging (langfuse_client ):
983955 """Test that verbose=True doesn't cause errors."""
984956 result = langfuse_client .run_batched_evaluation (
985957 scope = "traces" ,
@@ -997,7 +969,7 @@ def test_verbose_logging(sample_traces, langfuse_client):
997969# ============================================================================
998970
999971
1000- def test_item_evaluations_basic (sample_traces , langfuse_client ):
972+ def test_item_evaluations_basic (langfuse_client ):
1001973 """Test that item_evaluations dict contains correct structure."""
1002974
1003975 def test_evaluator (* , input , output , ** kwargs ):
@@ -1026,7 +998,7 @@ def test_evaluator(*, input, output, **kwargs):
1026998 assert evaluations [0 ].name == "test_metric"
1027999
10281000
1029- def test_item_evaluations_multiple_evaluators (sample_traces , langfuse_client ):
1001+ def test_item_evaluations_multiple_evaluators (langfuse_client ):
10301002 """Test item_evaluations with multiple evaluators."""
10311003
10321004 def accuracy_evaluator (* , input , output , ** kwargs ):
@@ -1052,7 +1024,7 @@ def relevance_evaluator(*, input, output, **kwargs):
10521024 assert eval_names == {"accuracy" , "relevance" }
10531025
10541026
1055- def test_item_evaluations_with_composite (sample_traces , langfuse_client ):
1027+ def test_item_evaluations_with_composite (langfuse_client ):
10561028 """Test that item_evaluations includes composite evaluations."""
10571029
10581030 def base_evaluator (* , input , output , ** kwargs ):
@@ -1084,7 +1056,7 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
10841056 assert result .total_composite_scores_created > 0
10851057
10861058
1087- def test_item_evaluations_empty_on_failure (sample_traces , langfuse_client ):
1059+ def test_item_evaluations_empty_on_failure (langfuse_client ):
10881060 """Test that failed items don't appear in item_evaluations."""
10891061
10901062 def failing_mapper (* , item ):
0 commit comments