Commit 47a37df

push

1 parent fec1786 · commit 47a37df
2 files changed: 39 additions & 67 deletions

File tree
  tests/test_batch_evaluation.py
  tests/utils.py

tests/test_batch_evaluation.py
Lines changed: 38 additions & 66 deletions
@@ -31,34 +31,6 @@ def langfuse_client():
     return get_client()
 
 
-@pytest.fixture
-def sample_traces(langfuse_client):
-    """Create sample traces in Langfuse for testing.
-
-    Returns:
-        List of trace IDs created
-    """
-    trace_ids = []
-    test_session_id = f"test-session-{create_uuid()}"
-
-    for i in range(5):
-        trace_name = f"batch-eval-test-{create_uuid()}"
-        with langfuse_client.start_as_current_span(name=trace_name) as span:
-            span.update_trace(
-                input=f"Test input {i}",
-                output=f"Test output {i}",
-                metadata={"test_index": i, "test_type": "batch_eval"},
-                session_id=test_session_id,
-                tags=["test", "batch_evaluation"],
-            )
-            trace_ids.append(langfuse_client.get_current_trace_id())
-
-    langfuse_client.flush()
-    time.sleep(3)  # Wait for API processing
-
-    return trace_ids
-
-
 @pytest.fixture
 def sample_trace_name():
     """Generate a unique trace name for filtering."""
@@ -92,7 +64,7 @@ def simple_evaluator(*, input, output, expected_output=None, metadata=None, **kw
 # ============================================================================
 
 
-def test_run_batched_evaluation_on_traces_basic(sample_traces, langfuse_client):
+def test_run_batched_evaluation_on_traces_basic(langfuse_client):
     """Test basic batch evaluation on traces."""
     result = langfuse_client.run_batched_evaluation(
         scope="traces",

@@ -118,7 +90,7 @@ def test_run_batched_evaluation_on_traces_basic(sample_traces, langfuse_client):
     assert stats.name == "simple_evaluator"
 
 
-def test_batch_evaluation_with_filter(sample_traces, langfuse_client):
+def test_batch_evaluation_with_filter(langfuse_client):
     """Test batch evaluation with JSON filter."""
     # Create a trace with specific tag
     unique_tag = f"test-filter-{create_uuid()}"

@@ -150,7 +122,7 @@ def test_batch_evaluation_with_filter(sample_traces, langfuse_client):
     assert result.completed is True
 
 
-def test_batch_evaluation_with_metadata(sample_traces, langfuse_client):
+def test_batch_evaluation_with_metadata(langfuse_client):
     """Test that additional metadata is added to all scores."""
 
     def metadata_checking_evaluator(*, input, output, metadata=None, **kwargs):

@@ -184,7 +156,7 @@ def metadata_checking_evaluator(*, input, output, metadata=None, **kwargs):
     assert result.completed is True
 
 
-def test_result_structure_fields(sample_traces, langfuse_client):
+def test_result_structure_fields(langfuse_client):
     """Test that BatchEvaluationResult has all expected fields."""
     result = langfuse_client.run_batched_evaluation(
         scope="traces",

@@ -223,7 +195,7 @@ def test_result_structure_fields(sample_traces, langfuse_client):
 # ============================================================================
 
 
-def test_simple_mapper(sample_traces, langfuse_client):
+def test_simple_mapper(langfuse_client):
     """Test basic mapper functionality."""
 
     def custom_mapper(*, item):

@@ -245,7 +217,7 @@ def custom_mapper(*, item):
 
 
 @pytest.mark.asyncio
-async def test_async_mapper(sample_traces, langfuse_client):
+async def test_async_mapper(langfuse_client):
     """Test that async mappers work correctly."""
 
     async def async_mapper(*, item):

@@ -268,7 +240,7 @@ async def async_mapper(*, item):
     assert result.total_items_processed > 0
 
 
-def test_mapper_failure_handling(sample_traces, langfuse_client):
+def test_mapper_failure_handling(langfuse_client):
     """Test that mapper failures cause items to be skipped."""
 
     def failing_mapper(*, item):

@@ -287,7 +259,7 @@ def failing_mapper(*, item):
     assert "ValueError" in result.error_summary or "Exception" in result.error_summary
 
 
-def test_mapper_with_missing_fields(sample_traces, langfuse_client):
+def test_mapper_with_missing_fields(langfuse_client):
     """Test mapper handles traces with missing fields gracefully."""
 
     def robust_mapper(*, item):

@@ -317,7 +289,7 @@ def robust_mapper(*, item):
 # ============================================================================
 
 
-def test_single_evaluator(sample_traces, langfuse_client):
+def test_single_evaluator(langfuse_client):
     """Test with a single evaluator."""
 
     def quality_evaluator(*, input, output, **kwargs):

@@ -335,7 +307,7 @@ def quality_evaluator(*, input, output, **kwargs):
     assert result.evaluator_stats[0].name == "quality_evaluator"
 
 
-def test_multiple_evaluators(sample_traces, langfuse_client):
+def test_multiple_evaluators(langfuse_client):
     """Test with multiple evaluators running in parallel."""
 
     def accuracy_evaluator(*, input, output, **kwargs):

@@ -360,7 +332,7 @@ def safety_evaluator(*, input, output, **kwargs):
 
 
 @pytest.mark.asyncio
-async def test_async_evaluator(sample_traces, langfuse_client):
+async def test_async_evaluator(langfuse_client):
     """Test that async evaluators work correctly."""
 
     async def async_evaluator(*, input, output, **kwargs):

@@ -377,7 +349,7 @@ async def async_evaluator(*, input, output, **kwargs):
     assert result.total_scores_created > 0
 
 
-def test_evaluator_returning_list(sample_traces, langfuse_client):
+def test_evaluator_returning_list(langfuse_client):
     """Test evaluator that returns multiple Evaluations."""
 
     def multi_score_evaluator(*, input, output, **kwargs):

@@ -398,7 +370,7 @@ def multi_score_evaluator(*, input, output, **kwargs):
     assert result.total_scores_created >= result.total_items_processed * 3
 
 
-def test_evaluator_failure_statistics(sample_traces, langfuse_client):
+def test_evaluator_failure_statistics(langfuse_client):
     """Test that evaluator failures are tracked in statistics."""
 
     def working_evaluator(*, input, output, **kwargs):

@@ -433,7 +405,7 @@ def failing_evaluator(*, input, output, **kwargs):
     assert result.total_evaluations_failed > 0
 
 
-def test_mixed_sync_async_evaluators(sample_traces, langfuse_client):
+def test_mixed_sync_async_evaluators(langfuse_client):
     """Test mixing synchronous and asynchronous evaluators."""
 
     def sync_evaluator(*, input, output, **kwargs):

@@ -459,7 +431,7 @@ async def async_evaluator(*, input, output, **kwargs):
 # ============================================================================
 
 
-def test_composite_evaluator_weighted_average(sample_traces, langfuse_client):
+def test_composite_evaluator_weighted_average(langfuse_client):
     """Test composite evaluator that computes weighted average."""
 
     def accuracy_evaluator(*, input, output, **kwargs):

@@ -496,7 +468,7 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
     assert result.total_scores_created > result.total_composite_scores_created
 
 
-def test_composite_evaluator_pass_fail(sample_traces, langfuse_client):
+def test_composite_evaluator_pass_fail(langfuse_client):
     """Test composite evaluator that implements pass/fail logic."""
 
     def metric1_evaluator(*, input, output, **kwargs):

@@ -532,7 +504,7 @@ def pass_fail_composite(*, input, output, expected_output, metadata, evaluations
 
 
 @pytest.mark.asyncio
-async def test_async_composite_evaluator(sample_traces, langfuse_client):
+async def test_async_composite_evaluator(langfuse_client):
     """Test async composite evaluator."""
 
     def evaluator1(*, input, output, **kwargs):

@@ -556,7 +528,7 @@ async def async_composite(*, input, output, expected_output, metadata, evaluatio
     assert result.total_composite_scores_created > 0
 
 
-def test_composite_evaluator_with_no_evaluations(sample_traces, langfuse_client):
+def test_composite_evaluator_with_no_evaluations(langfuse_client):
     """Test composite evaluator when no evaluations are present."""
 
     def always_failing_evaluator(*, input, output, **kwargs):

@@ -578,7 +550,7 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
     assert result.total_composite_scores_created == 0
 
 
-def test_composite_evaluator_failure_handling(sample_traces, langfuse_client):
+def test_composite_evaluator_failure_handling(langfuse_client):
     """Test that composite evaluator failures are handled gracefully."""
 
     def evaluator1(*, input, output, **kwargs):

@@ -606,7 +578,7 @@ def failing_composite(*, input, output, expected_output, metadata, evaluations):
 # ============================================================================
 
 
-def test_mapper_failure_skips_item(sample_traces, langfuse_client):
+def test_mapper_failure_skips_item(langfuse_client):
     """Test that mapper failure causes item to be skipped."""
 
     call_count = {"count": 0}

@@ -629,7 +601,7 @@ def sometimes_failing_mapper(*, item):
     assert result.total_items_processed > 0
 
 
-def test_evaluator_failure_continues(sample_traces, langfuse_client):
+def test_evaluator_failure_continues(langfuse_client):
     """Test that one evaluator failing doesn't stop others."""
 
     def working_evaluator1(*, input, output, **kwargs):

@@ -658,7 +630,7 @@ def working_evaluator2(*, input, output, **kwargs):
     assert failing_stats.failed_runs > 0
 
 
-def test_all_evaluators_fail(sample_traces, langfuse_client):
+def test_all_evaluators_fail(langfuse_client):
     """Test when all evaluators fail but item is still processed."""
 
     def failing_evaluator1(*, input, output, **kwargs):

@@ -706,7 +678,7 @@ def test_empty_results_handling(langfuse_client):
     assert result.has_more_items is False
 
 
-def test_max_items_zero(sample_traces, langfuse_client):
+def test_max_items_zero(langfuse_client):
     """Test with max_items=0 (should process no items)."""
     result = langfuse_client.run_batched_evaluation(
         scope="traces",

@@ -719,7 +691,7 @@ def test_max_items_zero(sample_traces, langfuse_client):
     assert result.total_items_processed == 0
 
 
-def test_evaluation_value_type_conversions(sample_traces, langfuse_client):
+def test_evaluation_value_type_conversions(langfuse_client):
     """Test that different evaluation value types are handled correctly."""
 
     def multi_type_evaluator(*, input, output, **kwargs):

@@ -746,7 +718,7 @@ def multi_type_evaluator(*, input, output, **kwargs):
 # ============================================================================
 
 
-def test_pagination_with_max_items(sample_traces, langfuse_client):
+def test_pagination_with_max_items(langfuse_client):
     """Test that max_items limit is respected."""
     # Create more traces to ensure we have enough data
     for i in range(10):

@@ -777,7 +749,7 @@ def test_pagination_with_max_items(sample_traces, langfuse_client):
     assert result.total_items_processed <= 5
 
 
-def test_has_more_items_flag(sample_traces, langfuse_client):
+def test_has_more_items_flag(langfuse_client):
     """Test that has_more_items flag is set correctly when max_items is reached."""
     # Create enough traces to exceed max_items
     batch_tag = f"batch-test-{create_uuid()}"

@@ -808,7 +780,7 @@ def test_has_more_items_flag(sample_traces, langfuse_client):
     assert result.has_more_items is True
 
 
-def test_fetch_batch_size_parameter(sample_traces, langfuse_client):
+def test_fetch_batch_size_parameter(langfuse_client):
     """Test that different fetch_batch_size values work correctly."""
     for batch_size in [1, 5, 10]:
         result = langfuse_client.run_batched_evaluation(

@@ -850,7 +822,7 @@ def test_resume_token_structure(langfuse_client):
 # ============================================================================
 
 
-def test_max_concurrency_parameter(sample_traces, langfuse_client):
+def test_max_concurrency_parameter(langfuse_client):
     """Test that max_concurrency parameter works correctly."""
     for concurrency in [1, 5, 10]:
         result = langfuse_client.run_batched_evaluation(

@@ -870,7 +842,7 @@ def test_max_concurrency_parameter(sample_traces, langfuse_client):
 # ============================================================================
 
 
-def test_evaluator_stats_structure(sample_traces, langfuse_client):
+def test_evaluator_stats_structure(langfuse_client):
     """Test that EvaluatorStats has correct structure."""
 
     def test_evaluator(*, input, output, **kwargs):

@@ -900,7 +872,7 @@ def test_evaluator(*, input, output, **kwargs):
     assert stats.failed_runs == 0
 
 
-def test_evaluator_stats_tracking(sample_traces, langfuse_client):
+def test_evaluator_stats_tracking(langfuse_client):
     """Test that evaluator statistics are tracked correctly."""
 
     call_count = {"count": 0}

@@ -925,7 +897,7 @@ def sometimes_failing_evaluator(*, input, output, **kwargs):
     assert stats.successful_runs + stats.failed_runs == stats.total_runs
 
 
-def test_error_summary_aggregation(sample_traces, langfuse_client):
+def test_error_summary_aggregation(langfuse_client):
     """Test that error types are aggregated correctly in error_summary."""
 
     def failing_mapper(*, item):

@@ -943,7 +915,7 @@ def failing_mapper(*, item):
     assert any("Error" in key for key in result.error_summary.keys())
 
 
-def test_failed_item_ids_collected(sample_traces, langfuse_client):
+def test_failed_item_ids_collected(langfuse_client):
     """Test that failed item IDs are collected."""
 
     def failing_mapper(*, item):

@@ -966,7 +938,7 @@ def failing_mapper(*, item):
 # ============================================================================
 
 
-def test_duration_tracking(sample_traces, langfuse_client):
+def test_duration_tracking(langfuse_client):
     """Test that duration is tracked correctly."""
     result = langfuse_client.run_batched_evaluation(
         scope="traces",

@@ -979,7 +951,7 @@ def test_duration_tracking(sample_traces, langfuse_client):
     assert result.duration_seconds < 60  # Should complete quickly for small batch
 
 
-def test_verbose_logging(sample_traces, langfuse_client):
+def test_verbose_logging(langfuse_client):
     """Test that verbose=True doesn't cause errors."""
     result = langfuse_client.run_batched_evaluation(
         scope="traces",

@@ -997,7 +969,7 @@ def test_verbose_logging(sample_traces, langfuse_client):
 # ============================================================================
 
 
-def test_item_evaluations_basic(sample_traces, langfuse_client):
+def test_item_evaluations_basic(langfuse_client):
     """Test that item_evaluations dict contains correct structure."""
 
     def test_evaluator(*, input, output, **kwargs):

@@ -1026,7 +998,7 @@ def test_evaluator(*, input, output, **kwargs):
     assert evaluations[0].name == "test_metric"
 
 
-def test_item_evaluations_multiple_evaluators(sample_traces, langfuse_client):
+def test_item_evaluations_multiple_evaluators(langfuse_client):
     """Test item_evaluations with multiple evaluators."""
 
     def accuracy_evaluator(*, input, output, **kwargs):

@@ -1052,7 +1024,7 @@ def relevance_evaluator(*, input, output, **kwargs):
     assert eval_names == {"accuracy", "relevance"}
 
 
-def test_item_evaluations_with_composite(sample_traces, langfuse_client):
+def test_item_evaluations_with_composite(langfuse_client):
     """Test that item_evaluations includes composite evaluations."""
 
     def base_evaluator(*, input, output, **kwargs):

@@ -1084,7 +1056,7 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
     assert result.total_composite_scores_created > 0
 
 
-def test_item_evaluations_empty_on_failure(sample_traces, langfuse_client):
+def test_item_evaluations_empty_on_failure(langfuse_client):
     """Test that failed items don't appear in item_evaluations."""
 
     def failing_mapper(*, item):

tests/utils.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ def create_uuid():
 
 
 def get_api():
-    sleep(3)
+    sleep(2)
 
     return FernLangfuse(
         username=os.environ.get("LANGFUSE_PUBLIC_KEY"),
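For context, get_api() in tests/utils.py now reads roughly as below. The diff hunk ends after the username kwarg; the password and base_url lines are assumptions following the standard Langfuse environment variables, not part of the commit:

    def get_api():
        sleep(2)  # shortened wait so recently flushed events are queryable

        return FernLangfuse(
            username=os.environ.get("LANGFUSE_PUBLIC_KEY"),
            # kwargs below are assumed; the hunk ends before them
            password=os.environ.get("LANGFUSE_SECRET_KEY"),
            base_url=os.environ.get("LANGFUSE_HOST"),
        )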
