Skip to content

Commit 31beb2b

Browse files
committed
push
1 parent 48ed142 commit 31beb2b

1 file changed

Lines changed: 0 additions & 144 deletions

File tree

tests/test_experiments.py

Lines changed: 0 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -853,147 +853,3 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
853853
)
854854
expected_value = 0.8 * 0.6 + 0.9 * 0.4 # 0.84
855855
assert abs(composite_eval.value - expected_value) < 0.001
856-
857-
858-
def test_experiment_composite_evaluator_pass_fail():
    """Test composite evaluator in experiments that implements pass/fail logic."""
    client = get_client()

    def score_metric1(*, input, output, **kwargs):
        return Evaluation(name="metric1", value=0.9)

    def score_metric2(*, input, output, **kwargs):
        return Evaluation(name="metric2", value=0.7)

    def check_thresholds(*, input, output, expected_output, metadata, evaluations):
        # Every numeric metric must meet its minimum; unknown metrics default to 0.
        minimums = {"metric1": 0.8, "metric2": 0.6}
        numeric = [e for e in evaluations if isinstance(e.value, (int, float))]
        all_passed = all(e.value >= minimums.get(e.name, 0) for e in numeric)
        return Evaluation(
            name="passes_all_checks",
            value=1.0 if all_passed else 0.0,
            comment="All checks passed" if all_passed else "Some checks failed",
        )

    outcome = client.run_experiment(
        name=f"Pass/Fail Composite Test {create_uuid()}",
        data=[{"input": "Test 1"}],
        task=mock_task,
        evaluators=[score_metric1, score_metric2],
        composite_evaluator=check_thresholds,
    )

    # One item, carrying both metric evaluations plus the composite verdict.
    assert len(outcome.item_results) == 1
    item = outcome.item_results[0]
    assert len(item.evaluations) == 3

    # Both metrics (0.9 >= 0.8, 0.7 >= 0.6) clear their thresholds, so it passes.
    verdict = next(e for e in item.evaluations if e.name == "passes_all_checks")
    assert verdict.value == 1.0
    assert verdict.comment == "All checks passed"
904-
905-
906-
@pytest.mark.asyncio
async def test_experiment_async_composite_evaluator():
    """Test async composite evaluator in experiments."""
    import asyncio

    client = get_client()

    def single_evaluator(*, input, output, **kwargs):
        return Evaluation(name="eval1", value=0.8)

    async def averaging_composite(*, input, output, expected_output, metadata, evaluations):
        # Simulate async processing
        await asyncio.sleep(0.01)
        numeric_total = sum(
            e.value for e in evaluations if isinstance(e.value, (int, float))
        )
        return Evaluation(name="async_composite", value=numeric_total / len(evaluations))

    outcome = client.run_experiment(
        name=f"Async Composite Test {create_uuid()}",
        data=[{"input": "Test 1"}],
        task=mock_task,
        evaluators=[single_evaluator],
        composite_evaluator=averaging_composite,
    )

    # Verify async composite evaluation: one item, eval1 plus the composite.
    assert len(outcome.item_results) == 1
    item = outcome.item_results[0]
    assert len(item.evaluations) == 2

    # With a single 0.8 evaluation, the average is exactly 0.8.
    composite = next(e for e in item.evaluations if e.name == "async_composite")
    assert composite.value == 0.8
942-
943-
944-
def test_experiment_composite_evaluator_with_no_evaluations():
    """Test composite evaluator in experiments when no evaluations are present."""
    client = get_client()

    def broken_evaluator(*, input, output, **kwargs):
        raise Exception("Always fails")

    def never_called_composite(*, input, output, expected_output, metadata, evaluations):
        # Should not be called if no evaluations succeed
        return Evaluation(name="composite", value=0.0)

    outcome = client.run_experiment(
        name=f"No Evals Composite Test {create_uuid()}",
        data=[{"input": "Test 1"}],
        task=mock_task,
        evaluators=[broken_evaluator],
        composite_evaluator=never_called_composite,
    )

    # The failing evaluator produces zero evaluations, so the composite
    # evaluator never runs and the item ends up with no evaluations at all.
    assert len(outcome.item_results) == 1
    assert len(outcome.item_results[0].evaluations) == 0
971-
972-
973-
def test_experiment_composite_evaluator_failure_handling():
    """Test that composite evaluator failures are handled gracefully in experiments."""
    client = get_client()

    def baseline_evaluator(*, input, output, **kwargs):
        return Evaluation(name="eval1", value=0.8)

    def exploding_composite(*, input, output, expected_output, metadata, evaluations):
        raise ValueError("Composite evaluator failed")

    outcome = client.run_experiment(
        name=f"Failing Composite Test {create_uuid()}",
        data=[{"input": "Test 1"}],
        task=mock_task,
        evaluators=[baseline_evaluator],
        composite_evaluator=exploding_composite,
    )

    # The composite raised, but the regular evaluation must still be recorded.
    assert len(outcome.item_results) == 1
    item = outcome.item_results[0]
    assert len(item.evaluations) == 1
    assert item.evaluations[0].name == "eval1"

0 commit comments

Comments
 (0)