@@ -853,147 +853,3 @@ def composite_evaluator(*, input, output, expected_output, metadata, evaluations
853853 )
854854 expected_value = 0.8 * 0.6 + 0.9 * 0.4 # 0.84
855855 assert abs (composite_eval .value - expected_value ) < 0.001
856-
857-
def test_experiment_composite_evaluator_pass_fail():
    """Composite evaluator implementing pass/fail gating over per-metric thresholds."""
    langfuse_client = get_client()

    def score_metric1(*, input, output, **kwargs):
        return Evaluation(name="metric1", value=0.9)

    def score_metric2(*, input, output, **kwargs):
        return Evaluation(name="metric2", value=0.7)

    def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
        # Minimum acceptable score per metric; unknown metrics default to 0.
        required = {"metric1": 0.8, "metric2": 0.6}

        all_passed = True
        for evaluation in evaluations:
            if not isinstance(evaluation.value, (int, float)):
                # Non-numeric scores are not gated.
                continue
            if evaluation.value < required.get(evaluation.name, 0):
                all_passed = False
                break

        return Evaluation(
            name="passes_all_checks",
            value=1.0 if all_passed else 0.0,
            comment="All checks passed" if all_passed else "Some checks failed",
        )

    result = langfuse_client.run_experiment(
        name=f"Pass/Fail Composite Test {create_uuid()}",
        data=[{"input": "Test 1"}],
        task=mock_task,
        evaluators=[score_metric1, score_metric2],
        composite_evaluator=pass_fail_composite,
    )

    # Single item carrying both metric evaluations plus the composite verdict.
    assert len(result.item_results) == 1
    item_result = result.item_results[0]
    assert len(item_result.evaluations) == 3

    composite_eval = next(
        e for e in item_result.evaluations if e.name == "passes_all_checks"
    )
    assert composite_eval.value == 1.0
    assert composite_eval.comment == "All checks passed"
904-
905-
@pytest.mark.asyncio
async def test_experiment_async_composite_evaluator():
    """Verify an async composite evaluator is awaited and its score recorded."""
    import asyncio

    langfuse_client = get_client()

    def base_evaluator(*, input, output, **kwargs):
        return Evaluation(name="eval1", value=0.8)

    async def async_composite(*, input, output, expected_output, metadata, evaluations):
        # Simulate asynchronous work before aggregating.
        await asyncio.sleep(0.01)
        numeric_scores = [
            e.value for e in evaluations if isinstance(e.value, (int, float))
        ]
        # Note: denominator is the full evaluation count, matching the sync variant.
        mean_score = sum(numeric_scores) / len(evaluations)
        return Evaluation(name="async_composite", value=mean_score)

    result = langfuse_client.run_experiment(
        name=f"Async Composite Test {create_uuid()}",
        data=[{"input": "Test 1"}],
        task=mock_task,
        evaluators=[base_evaluator],
        composite_evaluator=async_composite,
    )

    # One item: the base evaluation plus the awaited composite score.
    assert len(result.item_results) == 1
    item_result = result.item_results[0]
    assert len(item_result.evaluations) == 2

    composite_eval = next(
        e for e in item_result.evaluations if e.name == "async_composite"
    )
    assert composite_eval.value == 0.8
942-
943-
def test_experiment_composite_evaluator_with_no_evaluations():
    """Composite evaluator must be skipped when every item-level evaluator fails."""
    langfuse_client = get_client()

    def always_failing_evaluator(*, input, output, **kwargs):
        raise Exception("Always fails")

    def composite_evaluator(*, input, output, expected_output, metadata, evaluations):
        # Should never run: there are no successful evaluations to aggregate.
        return Evaluation(name="composite", value=0.0)

    result = langfuse_client.run_experiment(
        name=f"No Evals Composite Test {create_uuid()}",
        data=[{"input": "Test 1"}],
        task=mock_task,
        evaluators=[always_failing_evaluator],
        composite_evaluator=composite_evaluator,
    )

    assert len(result.item_results) == 1
    item_result = result.item_results[0]
    # The lone evaluator raised, so the item carries no evaluations at all —
    # and therefore no composite evaluation either.
    assert len(item_result.evaluations) == 0
971-
972-
def test_experiment_composite_evaluator_failure_handling():
    """A raising composite evaluator must not discard the regular evaluations."""
    langfuse_client = get_client()

    def base_evaluator(*, input, output, **kwargs):
        return Evaluation(name="eval1", value=0.8)

    def failing_composite(*, input, output, expected_output, metadata, evaluations):
        raise ValueError("Composite evaluator failed")

    result = langfuse_client.run_experiment(
        name=f"Failing Composite Test {create_uuid()}",
        data=[{"input": "Test 1"}],
        task=mock_task,
        evaluators=[base_evaluator],
        composite_evaluator=failing_composite,
    )

    assert len(result.item_results) == 1
    item_result = result.item_results[0]
    # Only the regular evaluation survives; the composite failure is swallowed.
    assert len(item_result.evaluations) == 1
    assert item_result.evaluations[0].name == "eval1"
0 commit comments