Commit 48ed142: push

1 parent 929a6a2 commit 48ed142

1 file changed: tests/test_batch_evaluation.py
Lines changed: 3 additions & 111 deletions
@@ -25,6 +25,9 @@
 # ============================================================================


+pytestmark = pytest.mark.skip(reason="Reason for skipping this file")
+
+
 @pytest.fixture
 def langfuse_client():
     """Get a Langfuse client for testing."""
@@ -90,9 +93,6 @@ def test_run_batched_evaluation_on_traces_basic(langfuse_client):
     assert stats.name == "simple_evaluator"


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_batch_evaluation_with_filter(langfuse_client):
     """Test batch evaluation with JSON filter."""
     # Create a trace with specific tag
@@ -125,9 +125,6 @@ def test_batch_evaluation_with_filter(langfuse_client):
     assert result.completed is True


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_batch_evaluation_with_metadata(langfuse_client):
     """Test that additional metadata is added to all scores."""

@@ -162,9 +159,6 @@ def metadata_checking_evaluator(*, input, output, metadata=None, **kwargs):
     assert result.completed is True


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_result_structure_fields(langfuse_client):
     """Test that BatchEvaluationResult has all expected fields."""
     result = langfuse_client.run_batched_evaluation(
@@ -204,9 +198,6 @@ def test_result_structure_fields(langfuse_client):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_simple_mapper(langfuse_client):
     """Test basic mapper functionality."""

@@ -228,9 +219,6 @@ def custom_mapper(*, item):
     assert result.total_items_processed > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 @pytest.mark.asyncio
 async def test_async_mapper(langfuse_client):
     """Test that async mappers work correctly."""
@@ -255,9 +243,6 @@ async def async_mapper(*, item):
     assert result.total_items_processed > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_mapper_failure_handling(langfuse_client):
     """Test that mapper failures cause items to be skipped."""

@@ -277,9 +262,6 @@ def failing_mapper(*, item):
     assert "ValueError" in result.error_summary or "Exception" in result.error_summary


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_mapper_with_missing_fields(langfuse_client):
     """Test mapper handles traces with missing fields gracefully."""

@@ -310,9 +292,6 @@ def robust_mapper(*, item):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_single_evaluator(langfuse_client):
     """Test with a single evaluator."""

@@ -331,9 +310,6 @@ def quality_evaluator(*, input, output, **kwargs):
     assert result.evaluator_stats[0].name == "quality_evaluator"


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_multiple_evaluators(langfuse_client):
     """Test with multiple evaluators running in parallel."""

@@ -358,9 +334,6 @@ def safety_evaluator(*, input, output, **kwargs):
     assert result.total_scores_created >= result.total_items_processed * 3


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 @pytest.mark.asyncio
 async def test_async_evaluator(langfuse_client):
     """Test that async evaluators work correctly."""
@@ -400,9 +373,6 @@ def multi_score_evaluator(*, input, output, **kwargs):
     assert result.total_scores_created >= result.total_items_processed * 3


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_evaluator_failure_statistics(langfuse_client):
     """Test that evaluator failures are tracked in statistics."""

@@ -438,9 +408,6 @@ def failing_evaluator(*, input, output, **kwargs):
     assert result.total_evaluations_failed > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_mixed_sync_async_evaluators(langfuse_client):
     """Test mixing synchronous and asynchronous evaluators."""

@@ -504,9 +471,6 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
     assert result.total_scores_created > result.total_composite_scores_created


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_composite_evaluator_pass_fail(langfuse_client):
     """Test composite evaluator that implements pass/fail logic."""

@@ -542,9 +506,6 @@ def pass_fail_composite(*, input, output, expected_output, metadata, evaluations
     assert result.total_composite_scores_created > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 @pytest.mark.asyncio
 async def test_async_composite_evaluator(langfuse_client):
     """Test async composite evaluator."""
@@ -570,9 +531,6 @@ async def async_composite(*, input, output, expected_output, metadata, evaluatio
     assert result.total_composite_scores_created > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_composite_evaluator_with_no_evaluations(langfuse_client):
     """Test composite evaluator when no evaluations are present."""

@@ -595,9 +553,6 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
     assert result.total_composite_scores_created == 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_composite_evaluator_failure_handling(langfuse_client):
     """Test that composite evaluator failures are handled gracefully."""

@@ -626,9 +581,6 @@ def failing_composite(*, input, output, expected_output, metadata, evaluations):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_mapper_failure_skips_item(langfuse_client):
     """Test that mapper failure causes item to be skipped."""

@@ -652,9 +604,6 @@ def sometimes_failing_mapper(*, item):
     assert result.total_items_processed > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_evaluator_failure_continues(langfuse_client):
     """Test that one evaluator failing doesn't stop others."""

@@ -684,9 +633,6 @@ def working_evaluator2(*, input, output, **kwargs):
     assert failing_stats.failed_runs > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_all_evaluators_fail(langfuse_client):
     """Test when all evaluators fail but item is still processed."""

@@ -716,9 +662,6 @@ def failing_evaluator2(*, input, output, **kwargs):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_empty_results_handling(langfuse_client):
     """Test batch evaluation when filter returns no items."""
     nonexistent_name = f"nonexistent-trace-{create_uuid()}"
@@ -738,9 +681,6 @@ def test_empty_results_handling(langfuse_client):
     assert result.has_more_items is False


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_max_items_zero(langfuse_client):
     """Test with max_items=0 (should process no items)."""
     result = langfuse_client.run_batched_evaluation(
@@ -754,9 +694,6 @@ def test_max_items_zero(langfuse_client):
     assert result.total_items_processed == 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_evaluation_value_type_conversions(langfuse_client):
     """Test that different evaluation value types are handled correctly."""

@@ -784,9 +721,6 @@ def multi_type_evaluator(*, input, output, **kwargs):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_pagination_with_max_items(langfuse_client):
     """Test that max_items limit is respected."""
     # Create more traces to ensure we have enough data
@@ -818,9 +752,6 @@ def test_pagination_with_max_items(langfuse_client):
     assert result.total_items_processed <= 5


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_has_more_items_flag(langfuse_client):
     """Test that has_more_items flag is set correctly when max_items is reached."""
     # Create enough traces to exceed max_items
@@ -852,9 +783,6 @@ def test_has_more_items_flag(langfuse_client):
     assert result.has_more_items is True


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_fetch_batch_size_parameter(langfuse_client):
     """Test that different fetch_batch_size values work correctly."""
     for batch_size in [1, 5, 10]:
@@ -875,9 +803,6 @@ def test_fetch_batch_size_parameter(langfuse_client):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_resume_token_structure(langfuse_client):
     """Test that BatchEvaluationResumeToken has correct structure."""
     resume_token = BatchEvaluationResumeToken(
@@ -900,9 +825,6 @@ def test_resume_token_structure(langfuse_client):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_max_concurrency_parameter(langfuse_client):
     """Test that max_concurrency parameter works correctly."""
     for concurrency in [1, 5, 10]:
@@ -923,9 +845,6 @@ def test_max_concurrency_parameter(langfuse_client):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_evaluator_stats_structure(langfuse_client):
     """Test that EvaluatorStats has correct structure."""

@@ -956,9 +875,6 @@ def test_evaluator(*, input, output, **kwargs):
     assert stats.failed_runs == 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_evaluator_stats_tracking(langfuse_client):
     """Test that evaluator statistics are tracked correctly."""

@@ -984,9 +900,6 @@ def sometimes_failing_evaluator(*, input, output, **kwargs):
     assert stats.successful_runs + stats.failed_runs == stats.total_runs


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_error_summary_aggregation(langfuse_client):
     """Test that error types are aggregated correctly in error_summary."""

@@ -1005,9 +918,6 @@ def failing_mapper(*, item):
     assert any("Error" in key for key in result.error_summary.keys())


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_failed_item_ids_collected(langfuse_client):
     """Test that failed item IDs are collected."""

@@ -1031,9 +941,6 @@ def failing_mapper(*, item):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_duration_tracking(langfuse_client):
     """Test that duration is tracked correctly."""
     result = langfuse_client.run_batched_evaluation(
@@ -1047,9 +954,6 @@ def test_duration_tracking(langfuse_client):
     assert result.duration_seconds < 60  # Should complete quickly for small batch


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_verbose_logging(langfuse_client):
     """Test that verbose=True doesn't cause errors."""
     result = langfuse_client.run_batched_evaluation(
@@ -1068,9 +972,6 @@ def test_verbose_logging(langfuse_client):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_item_evaluations_basic(langfuse_client):
     """Test that item_evaluations dict contains correct structure."""

@@ -1100,9 +1001,6 @@ def test_evaluator(*, input, output, **kwargs):
     assert evaluations[0].name == "test_metric"


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_item_evaluations_multiple_evaluators(langfuse_client):
     """Test item_evaluations with multiple evaluators."""

@@ -1129,9 +1027,6 @@ def relevance_evaluator(*, input, output, **kwargs):
     assert eval_names == {"accuracy", "relevance"}


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_item_evaluations_with_composite(langfuse_client):
     """Test that item_evaluations includes composite evaluations."""

@@ -1164,9 +1059,6 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
     assert result.total_composite_scores_created > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_item_evaluations_empty_on_failure(langfuse_client):
     """Test that failed items don't appear in item_evaluations."""

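For context: assigning a mark to the module-level pytestmark variable is pytest's documented way to apply that mark to every test collected from a file, which is what makes the 37 per-function @pytest.mark.skip(...) decorators removed above redundant. A minimal sketch of the pattern (file contents hypothetical; the reason string is taken from the removed decorators):

import pytest

# A module-level pytestmark applies this mark to every test in the file,
# including async ones. A list of marks is also accepted, e.g.
# pytestmark = [pytest.mark.skip(...), pytest.mark.asyncio].
pytestmark = pytest.mark.skip(
    reason="Single Github CI runner overwhelmed by amount of scores created"
)


def test_example():
    # Never executes: pytest reports it as skipped with the reason above.
    assert 1 + 1 == 2
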
0 commit comments
