@@ -25,6 +25,9 @@
 # ============================================================================


+pytestmark = pytest.mark.skip(reason="Reason for skipping this file")
+
+
 @pytest.fixture
 def langfuse_client():
     """Get a Langfuse client for testing."""
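Note on the change above: `pytestmark` is pytest's module-level marker hook. Any mark (or list of marks) assigned to that name at module scope is applied to every test collected from the file, which is why this single added line lets the commit delete the repeated `@pytest.mark.skip(...)` decorators in all of the hunks below. A minimal sketch of the pattern (file contents hypothetical):

```python
import pytest

# Applied by pytest at collection time to every test in this module.
pytestmark = pytest.mark.skip(reason="Reason for skipping this file")


def test_anything():
    # Never executes; pytest reports it as SKIPPED with the reason above.
    assert False
```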
@@ -90,9 +93,6 @@ def test_run_batched_evaluation_on_traces_basic(langfuse_client):
     assert stats.name == "simple_evaluator"


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_batch_evaluation_with_filter(langfuse_client):
     """Test batch evaluation with JSON filter."""
     # Create a trace with specific tag
@@ -125,9 +125,6 @@ def test_batch_evaluation_with_filter(langfuse_client):
     assert result.completed is True


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_batch_evaluation_with_metadata(langfuse_client):
     """Test that additional metadata is added to all scores."""

@@ -162,9 +159,6 @@ def metadata_checking_evaluator(*, input, output, metadata=None, **kwargs):
     assert result.completed is True


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_result_structure_fields(langfuse_client):
     """Test that BatchEvaluationResult has all expected fields."""
     result = langfuse_client.run_batched_evaluation(
@@ -204,9 +198,6 @@ def test_result_structure_fields(langfuse_client):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_simple_mapper(langfuse_client):
     """Test basic mapper functionality."""

@@ -228,9 +219,6 @@ def custom_mapper(*, item):
     assert result.total_items_processed > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 @pytest.mark.asyncio
 async def test_async_mapper(langfuse_client):
     """Test that async mappers work correctly."""
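The async tests here carry `@pytest.mark.asyncio`, the marker used by the pytest-asyncio plugin to run a coroutine test function inside an event loop it manages (the plugin configuration is not shown in this diff, so treat that as an assumption). A minimal sketch, assuming pytest-asyncio is installed:

```python
import asyncio

import pytest


@pytest.mark.asyncio
async def test_async_sketch():
    # pytest-asyncio supplies the event loop and awaits this coroutine.
    await asyncio.sleep(0)
    assert True
```

Marks from `pytestmark` and from decorators are combined at collection time, so the module-level skip added above also covers these coroutine tests.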
@@ -255,9 +243,6 @@ async def async_mapper(*, item):
     assert result.total_items_processed > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_mapper_failure_handling(langfuse_client):
     """Test that mapper failures cause items to be skipped."""

@@ -277,9 +262,6 @@ def failing_mapper(*, item):
     assert "ValueError" in result.error_summary or "Exception" in result.error_summary


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_mapper_with_missing_fields(langfuse_client):
     """Test mapper handles traces with missing fields gracefully."""

@@ -310,9 +292,6 @@ def robust_mapper(*, item):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_single_evaluator(langfuse_client):
     """Test with a single evaluator."""

@@ -331,9 +310,6 @@ def quality_evaluator(*, input, output, **kwargs):
     assert result.evaluator_stats[0].name == "quality_evaluator"


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_multiple_evaluators(langfuse_client):
     """Test with multiple evaluators running in parallel."""

@@ -358,9 +334,6 @@ def safety_evaluator(*, input, output, **kwargs):
     assert result.total_scores_created >= result.total_items_processed * 3


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 @pytest.mark.asyncio
 async def test_async_evaluator(langfuse_client):
     """Test that async evaluators work correctly."""
@@ -400,9 +373,6 @@ def multi_score_evaluator(*, input, output, **kwargs):
     assert result.total_scores_created >= result.total_items_processed * 3


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_evaluator_failure_statistics(langfuse_client):
     """Test that evaluator failures are tracked in statistics."""

@@ -438,9 +408,6 @@ def failing_evaluator(*, input, output, **kwargs):
     assert result.total_evaluations_failed > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_mixed_sync_async_evaluators(langfuse_client):
     """Test mixing synchronous and asynchronous evaluators."""

@@ -504,9 +471,6 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
     assert result.total_scores_created > result.total_composite_scores_created


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_composite_evaluator_pass_fail(langfuse_client):
     """Test composite evaluator that implements pass/fail logic."""

@@ -542,9 +506,6 @@ def pass_fail_composite(*, input, output, expected_output, metadata, evaluations
     assert result.total_composite_scores_created > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 @pytest.mark.asyncio
 async def test_async_composite_evaluator(langfuse_client):
     """Test async composite evaluator."""
@@ -570,9 +531,6 @@ async def async_composite(*, input, output, expected_output, metadata, evaluatio
     assert result.total_composite_scores_created > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_composite_evaluator_with_no_evaluations(langfuse_client):
     """Test composite evaluator when no evaluations are present."""

@@ -595,9 +553,6 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
     assert result.total_composite_scores_created == 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_composite_evaluator_failure_handling(langfuse_client):
     """Test that composite evaluator failures are handled gracefully."""

@@ -626,9 +581,6 @@ def failing_composite(*, input, output, expected_output, metadata, evaluations):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_mapper_failure_skips_item(langfuse_client):
     """Test that mapper failure causes item to be skipped."""

@@ -652,9 +604,6 @@ def sometimes_failing_mapper(*, item):
     assert result.total_items_processed > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_evaluator_failure_continues(langfuse_client):
     """Test that one evaluator failing doesn't stop others."""

@@ -684,9 +633,6 @@ def working_evaluator2(*, input, output, **kwargs):
     assert failing_stats.failed_runs > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_all_evaluators_fail(langfuse_client):
     """Test when all evaluators fail but item is still processed."""

@@ -716,9 +662,6 @@ def failing_evaluator2(*, input, output, **kwargs):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_empty_results_handling(langfuse_client):
     """Test batch evaluation when filter returns no items."""
     nonexistent_name = f"nonexistent-trace-{create_uuid()}"
@@ -738,9 +681,6 @@ def test_empty_results_handling(langfuse_client):
     assert result.has_more_items is False


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_max_items_zero(langfuse_client):
     """Test with max_items=0 (should process no items)."""
     result = langfuse_client.run_batched_evaluation(
@@ -754,9 +694,6 @@ def test_max_items_zero(langfuse_client):
     assert result.total_items_processed == 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_evaluation_value_type_conversions(langfuse_client):
     """Test that different evaluation value types are handled correctly."""

@@ -784,9 +721,6 @@ def multi_type_evaluator(*, input, output, **kwargs):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_pagination_with_max_items(langfuse_client):
     """Test that max_items limit is respected."""
     # Create more traces to ensure we have enough data
@@ -818,9 +752,6 @@ def test_pagination_with_max_items(langfuse_client):
     assert result.total_items_processed <= 5


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_has_more_items_flag(langfuse_client):
     """Test that has_more_items flag is set correctly when max_items is reached."""
     # Create enough traces to exceed max_items
@@ -852,9 +783,6 @@ def test_has_more_items_flag(langfuse_client):
     assert result.has_more_items is True


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_fetch_batch_size_parameter(langfuse_client):
     """Test that different fetch_batch_size values work correctly."""
     for batch_size in [1, 5, 10]:
@@ -875,9 +803,6 @@ def test_fetch_batch_size_parameter(langfuse_client):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_resume_token_structure(langfuse_client):
     """Test that BatchEvaluationResumeToken has correct structure."""
     resume_token = BatchEvaluationResumeToken(
@@ -900,9 +825,6 @@ def test_resume_token_structure(langfuse_client):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_max_concurrency_parameter(langfuse_client):
     """Test that max_concurrency parameter works correctly."""
     for concurrency in [1, 5, 10]:
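Taken together, the tests in this file exercise a broad surface of `run_batched_evaluation`: mappers, sync/async evaluators, composite evaluators, filtering, pagination (`max_items`, `fetch_batch_size`, `has_more_items`), and concurrency (`max_concurrency`). A hedged sketch of a typical call, using only parameter and result-field names that appear in these tests; the exact signature and the evaluator return format are not visible in this diff, so the details below are assumptions:

```python
from langfuse import Langfuse  # assumes the Langfuse Python SDK is installed

langfuse_client = Langfuse()


def length_evaluator(*, input, output, **kwargs):
    # Keyword-only signature mirrors the evaluators defined in these tests;
    # the dict return format is an assumption.
    return {"name": "length", "value": len(str(output or ""))}


result = langfuse_client.run_batched_evaluation(
    scope="traces",        # assumption: these tests evaluate traces
    mapper=lambda *, item: item,
    evaluators=[length_evaluator],
    max_items=10,          # cap on items processed
    fetch_batch_size=5,    # page size when fetching items
    max_concurrency=5,     # parallel evaluation limit
    verbose=True,
)

assert result.completed is True
assert result.total_items_processed <= 10
```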
@@ -923,9 +845,6 @@ def test_max_concurrency_parameter(langfuse_client):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_evaluator_stats_structure(langfuse_client):
     """Test that EvaluatorStats has correct structure."""

@@ -956,9 +875,6 @@ def test_evaluator(*, input, output, **kwargs):
     assert stats.failed_runs == 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_evaluator_stats_tracking(langfuse_client):
     """Test that evaluator statistics are tracked correctly."""

@@ -984,9 +900,6 @@ def sometimes_failing_evaluator(*, input, output, **kwargs):
     assert stats.successful_runs + stats.failed_runs == stats.total_runs


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_error_summary_aggregation(langfuse_client):
     """Test that error types are aggregated correctly in error_summary."""

@@ -1005,9 +918,6 @@ def failing_mapper(*, item):
     assert any("Error" in key for key in result.error_summary.keys())


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_failed_item_ids_collected(langfuse_client):
     """Test that failed item IDs are collected."""

@@ -1031,9 +941,6 @@ def failing_mapper(*, item):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_duration_tracking(langfuse_client):
     """Test that duration is tracked correctly."""
     result = langfuse_client.run_batched_evaluation(
@@ -1047,9 +954,6 @@ def test_duration_tracking(langfuse_client):
     assert result.duration_seconds < 60  # Should complete quickly for small batch


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_verbose_logging(langfuse_client):
     """Test that verbose=True doesn't cause errors."""
     result = langfuse_client.run_batched_evaluation(
@@ -1068,9 +972,6 @@ def test_verbose_logging(langfuse_client):
 # ============================================================================


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_item_evaluations_basic(langfuse_client):
     """Test that item_evaluations dict contains correct structure."""

@@ -1100,9 +1001,6 @@ def test_evaluator(*, input, output, **kwargs):
     assert evaluations[0].name == "test_metric"


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_item_evaluations_multiple_evaluators(langfuse_client):
     """Test item_evaluations with multiple evaluators."""

@@ -1129,9 +1027,6 @@ def relevance_evaluator(*, input, output, **kwargs):
     assert eval_names == {"accuracy", "relevance"}


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_item_evaluations_with_composite(langfuse_client):
     """Test that item_evaluations includes composite evaluations."""

@@ -1164,9 +1059,6 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
     assert result.total_composite_scores_created > 0


-@pytest.mark.skip(
-    reason="Single Github CI runner overwhelmed by amount of scores created"
-)
 def test_item_evaluations_empty_on_failure(langfuse_client):
     """Test that failed items don't appear in item_evaluations."""
