@@ -853,147 +853,3 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
853853 )
854854 expected_value = 0.8 * 0.6 + 0.9 * 0.4 # 0.84
855855 assert abs (composite_eval .value - expected_value ) < 0.001
856-
857-
def test_experiment_composite_evaluator_pass_fail():
    """Composite evaluator implementing pass/fail gating over per-metric thresholds."""
    langfuse_client = get_client()

    def score_metric1(*, input, output, **kwargs):
        return Evaluation(name="metric1", value=0.9)

    def score_metric2(*, input, output, **kwargs):
        return Evaluation(name="metric2", value=0.7)

    def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
        # Minimum acceptable score per metric; unknown metrics default to 0.
        required = {"metric1": 0.8, "metric2": 0.6}

        all_passed = True
        for evaluation in evaluations:
            if not isinstance(evaluation.value, (int, float)):
                # Non-numeric scores are not gated.
                continue
            if evaluation.value < required.get(evaluation.name, 0):
                all_passed = False
                break

        return Evaluation(
            name="passes_all_checks",
            value=1.0 if all_passed else 0.0,
            comment="All checks passed" if all_passed else "Some checks failed",
        )

    result = langfuse_client.run_experiment(
        name=f"Pass/Fail Composite Test {create_uuid()}",
        data=[{"input": "Test 1"}],
        task=mock_task,
        evaluators=[score_metric1, score_metric2],
        composite_evaluator=pass_fail_composite,
    )

    # Single item carrying both metric evaluations plus the composite verdict.
    assert len(result.item_results) == 1
    item_result = result.item_results[0]
    assert len(item_result.evaluations) == 3

    composite_eval = next(
        e for e in item_result.evaluations if e.name == "passes_all_checks"
    )
    assert composite_eval.value == 1.0
    assert composite_eval.comment == "All checks passed"
904-
905-
@pytest.mark.asyncio
async def test_experiment_async_composite_evaluator():
    """Verify an async composite evaluator is awaited and its score recorded."""
    import asyncio

    langfuse_client = get_client()

    def base_evaluator(*, input, output, **kwargs):
        return Evaluation(name="eval1", value=0.8)

    async def async_composite(*, input, output, expected_output, metadata, evaluations):
        # Simulate asynchronous work before aggregating.
        await asyncio.sleep(0.01)
        numeric_scores = [
            e.value for e in evaluations if isinstance(e.value, (int, float))
        ]
        # Note: denominator is the full evaluation count, matching the sync variant.
        mean_score = sum(numeric_scores) / len(evaluations)
        return Evaluation(name="async_composite", value=mean_score)

    result = langfuse_client.run_experiment(
        name=f"Async Composite Test {create_uuid()}",
        data=[{"input": "Test 1"}],
        task=mock_task,
        evaluators=[base_evaluator],
        composite_evaluator=async_composite,
    )

    # One item: the base evaluation plus the awaited composite score.
    assert len(result.item_results) == 1
    item_result = result.item_results[0]
    assert len(item_result.evaluations) == 2

    composite_eval = next(
        e for e in item_result.evaluations if e.name == "async_composite"
    )
    assert composite_eval.value == 0.8
942-
943-
def test_experiment_composite_evaluator_with_no_evaluations():
    """Composite evaluator must be skipped when every item-level evaluator fails."""
    langfuse_client = get_client()

    def always_failing_evaluator(*, input, output, **kwargs):
        raise Exception("Always fails")

    def composite_evaluator(*, input, output, expected_output, metadata, evaluations):
        # Should never run: there are no successful evaluations to aggregate.
        return Evaluation(name="composite", value=0.0)

    result = langfuse_client.run_experiment(
        name=f"No Evals Composite Test {create_uuid()}",
        data=[{"input": "Test 1"}],
        task=mock_task,
        evaluators=[always_failing_evaluator],
        composite_evaluator=composite_evaluator,
    )

    assert len(result.item_results) == 1
    item_result = result.item_results[0]
    # The lone evaluator raised, so the item carries no evaluations at all —
    # and therefore no composite evaluation either.
    assert len(item_result.evaluations) == 0
971-
972-
def test_experiment_composite_evaluator_failure_handling():
    """A raising composite evaluator must not discard the regular evaluations."""
    langfuse_client = get_client()

    def base_evaluator(*, input, output, **kwargs):
        return Evaluation(name="eval1", value=0.8)

    def failing_composite(*, input, output, expected_output, metadata, evaluations):
        raise ValueError("Composite evaluator failed")

    result = langfuse_client.run_experiment(
        name=f"Failing Composite Test {create_uuid()}",
        data=[{"input": "Test 1"}],
        task=mock_task,
        evaluators=[base_evaluator],
        composite_evaluator=failing_composite,
    )

    assert len(result.item_results) == 1
    item_result = result.item_results[0]
    # Only the regular evaluation survives; the composite failure is swallowed.
    assert len(item_result.evaluations) == 1
    assert item_result.evaluations[0].name == "eval1"
0 commit comments