
Commit 66288a6

add composite evaluator to run_experiments
1 parent e069711 commit 66288a6

14 files changed

Lines changed: 543 additions & 249 deletions

.pre-commit-config.yaml

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.2
+    rev: v0.14.4
     hooks:
       # Run the linter and fix
       - id: ruff
@@ -10,6 +10,7 @@ repos:
       # Run the formatter.
       - id: ruff-format
         types_or: [python, pyi, jupyter]
+        args: [--config=ci.ruff.toml]

   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.18.2

langfuse/_client/client.py

Lines changed: 61 additions & 1 deletion
@@ -2465,6 +2465,7 @@ def run_experiment(
         data: ExperimentData,
         task: TaskFunction,
         evaluators: List[EvaluatorFunction] = [],
+        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
         run_evaluators: List[RunEvaluatorFunction] = [],
         max_concurrency: int = 50,
         metadata: Optional[Dict[str, str]] = None,
@@ -2500,6 +2501,10 @@ def run_experiment(
             evaluators: List of functions to evaluate each item's output individually.
                 Each evaluator receives input, output, expected_output, and metadata.
                 Can return single Evaluation dict or list of Evaluation dicts.
+            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
+                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
+                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
+                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
             run_evaluators: List of functions to evaluate the entire experiment run.
                 Each run evaluator receives all item_results and can compute aggregate metrics.
                 Useful for calculating averages, distributions, or cross-item comparisons.
@@ -2637,6 +2642,7 @@ def average_accuracy(*, item_results, **kwargs):
             data=data,
             task=task,
             evaluators=evaluators or [],
+            composite_evaluator=composite_evaluator,
             run_evaluators=run_evaluators or [],
             max_concurrency=max_concurrency,
             metadata=metadata,
@@ -2653,6 +2659,7 @@ async def _run_experiment_async(
         data: ExperimentData,
         task: TaskFunction,
         evaluators: List[EvaluatorFunction],
+        composite_evaluator: Optional[CompositeEvaluatorFunction],
         run_evaluators: List[RunEvaluatorFunction],
         max_concurrency: int,
         metadata: Optional[Dict[str, Any]] = None,
@@ -2668,7 +2675,14 @@ async def _run_experiment_async(
         async def process_item(item: ExperimentItem) -> ExperimentItemResult:
             async with semaphore:
                 return await self._process_experiment_item(
-                    item, task, evaluators, name, run_name, description, metadata
+                    item,
+                    task,
+                    evaluators,
+                    composite_evaluator,
+                    name,
+                    run_name,
+                    description,
+                    metadata,
                 )

         # Run all items concurrently
@@ -2750,6 +2764,7 @@ async def _process_experiment_item(
         item: ExperimentItem,
         task: Callable,
         evaluators: List[Callable],
+        composite_evaluator: Optional[CompositeEvaluatorFunction],
         experiment_name: str,
         experiment_run_name: str,
         experiment_description: Optional[str],
@@ -2908,6 +2923,51 @@ async def _process_experiment_item(
                 except Exception as e:
                     langfuse_logger.error(f"Evaluator failed: {e}")

+        # Run composite evaluator if provided and we have evaluations
+        if composite_evaluator and evaluations:
+            try:
+                composite_eval_metadata: Optional[Dict[str, Any]] = None
+                if isinstance(item, dict):
+                    composite_eval_metadata = item.get("metadata")
+                elif hasattr(item, "metadata"):
+                    composite_eval_metadata = item.metadata
+
+                result = composite_evaluator(
+                    input=input_data,
+                    output=output,
+                    expected_output=expected_output,
+                    metadata=composite_eval_metadata,
+                    evaluations=evaluations,
+                )
+
+                # Handle async composite evaluators
+                if asyncio.iscoroutine(result):
+                    result = await result
+
+                # Normalize to list
+                composite_evals: List[Evaluation] = []
+                if isinstance(result, (dict, Evaluation)):
+                    composite_evals = [result]  # type: ignore
+                elif isinstance(result, list):
+                    composite_evals = result  # type: ignore
+
+                # Store composite evaluations as scores and add to evaluations list
+                for composite_evaluation in composite_evals:
+                    self.create_score(
+                        trace_id=trace_id,
+                        observation_id=span.id,
+                        name=composite_evaluation.name,
+                        value=composite_evaluation.value,  # type: ignore
+                        comment=composite_evaluation.comment,
+                        metadata=composite_evaluation.metadata,
+                        config_id=composite_evaluation.config_id,
+                        data_type=composite_evaluation.data_type,  # type: ignore
+                    )
+                    evaluations.append(composite_evaluation)
+
+            except Exception as e:
+                langfuse_logger.error(f"Composite evaluator failed: {e}")
+
         return ExperimentItemResult(
             item=item,
             output=output,
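
For context, a minimal sketch of how the new parameter could be used end to end. The experiment name, data item, task, evaluator names, and the `Evaluation` import path below are illustrative assumptions, not part of this commit:

```python
# Sketch only: made-up task, evaluators, and score names; the Evaluation
# import path is assumed and may need adjusting for your installed version.
from langfuse import Evaluation, Langfuse  # Evaluation export path assumed

langfuse = Langfuse()


def my_task(*, item, **kwargs):
    # Placeholder task; replace with the real application call.
    return f"answer for {item['input']}"


def accuracy(*, input, output, expected_output, metadata, **kwargs):
    return Evaluation(name="accuracy", value=1.0 if output == expected_output else 0.0)


def brevity(*, input, output, **kwargs):
    return Evaluation(name="brevity", value=1.0 if len(str(output)) < 200 else 0.0)


def overall(*, input, output, expected_output, metadata, evaluations, **kwargs):
    # Composite evaluator: receives the same inputs as item-level evaluators
    # plus their results, and folds them into one score per item.
    values = [e.value for e in evaluations if isinstance(e.value, (int, float))]
    return Evaluation(name="overall", value=sum(values) / len(values) if values else 0.0)


result = langfuse.run_experiment(
    name="composite-demo",
    data=[{"input": "2+2", "expected_output": "4"}],
    task=my_task,
    evaluators=[accuracy, brevity],
    composite_evaluator=overall,
)
```

Per the handling above, if `overall` raised, the item would still complete and only the composite score would be missing, since the composite call is wrapped in its own try/except.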

langfuse/_client/datasets.py

Lines changed: 7 additions & 0 deletions
@@ -4,6 +4,7 @@

 from opentelemetry.util._decorator import _agnosticcontextmanager

+from langfuse.batch_evaluation import CompositeEvaluatorFunction
 from langfuse.experiment import (
     EvaluatorFunction,
     ExperimentResult,
@@ -204,6 +205,7 @@ def run_experiment(
         description: Optional[str] = None,
         task: TaskFunction,
         evaluators: List[EvaluatorFunction] = [],
+        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
         run_evaluators: List[RunEvaluatorFunction] = [],
         max_concurrency: int = 50,
         metadata: Optional[Dict[str, Any]] = None,
@@ -234,6 +236,10 @@ def run_experiment(
                 .metadata attributes. Signature should be: task(*, item, **kwargs) -> Any
             evaluators: List of functions to evaluate each item's output individually.
                 These will have access to the item's expected_output for comparison.
+            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
+                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
+                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
+                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
             run_evaluators: List of functions to evaluate the entire experiment run.
                 Useful for computing aggregate statistics across all dataset items.
             max_concurrency: Maximum number of concurrent task executions (default: 50).
@@ -411,6 +417,7 @@ def content_diversity(*, item_results, **kwargs):
             data=self.items,
             task=task,
             evaluators=evaluators,
+            composite_evaluator=composite_evaluator,
             run_evaluators=run_evaluators,
             max_concurrency=max_concurrency,
             metadata=metadata,
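
The same keyword is now available on dataset runs. A rough usage sketch, assuming a dataset named `qa-dataset` exists and using placeholder task and score names (hypothetical throughout):

```python
# Sketch only: dataset name, task, and score names are illustrative; the
# Evaluation import path is assumed.
from langfuse import Evaluation, Langfuse

langfuse = Langfuse()
dataset = langfuse.get_dataset("qa-dataset")  # hypothetical dataset


def answer(*, item, **kwargs):
    # Dataset items expose .input / .expected_output / .metadata attributes.
    return f"echo: {item.input}"


def exact_match(*, input, output, expected_output, **kwargs):
    return Evaluation(name="exact_match", value=1.0 if output == expected_output else 0.0)


def overall(*, input, output, expected_output, metadata, evaluations, **kwargs):
    values = [e.value for e in evaluations if isinstance(e.value, (int, float))]
    return Evaluation(name="overall", value=sum(values) / len(values) if values else 0.0)


result = dataset.run_experiment(
    name="qa-regression",
    task=answer,
    evaluators=[exact_match],
    composite_evaluator=overall,
)
```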

langfuse/_client/observe.py

Lines changed: 6 additions & 2 deletions
@@ -589,7 +589,9 @@ def __next__(self) -> Any:
             raise  # Re-raise StopIteration

         except Exception as e:
-            self.span.update(level="ERROR", status_message=str(e) or type(e).__name__).end()
+            self.span.update(
+                level="ERROR", status_message=str(e) or type(e).__name__
+            ).end()

             raise
@@ -654,6 +656,8 @@ async def __anext__(self) -> Any:

             raise  # Re-raise StopAsyncIteration
         except Exception as e:
-            self.span.update(level="ERROR", status_message=str(e) or type(e).__name__).end()
+            self.span.update(
+                level="ERROR", status_message=str(e) or type(e).__name__
+            ).end()

             raise

langfuse/batch_evaluation.py

Lines changed: 37 additions & 15 deletions
@@ -218,7 +218,8 @@ class CompositeEvaluatorFunction(Protocol):
     composite assessments based on individual evaluation results.

     Composite evaluators:
-    - Accept the original item and its list of evaluations
+    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
+      plus the list of evaluations
     - Return either a single Evaluation, a list of Evaluations, or a dict
     - Can be either synchronous or asynchronous
     - Have access to both raw item data and evaluation results
@@ -227,7 +228,10 @@ class CompositeEvaluatorFunction(Protocol):
     def __call__(
         self,
         *,
-        item: Union["TraceWithFullDetails", "ObservationsView"],
+        input: Optional[Any] = None,
+        output: Optional[Any] = None,
+        expected_output: Optional[Any] = None,
+        metadata: Optional[Dict[str, Any]] = None,
         evaluations: List[Evaluation],
         **kwargs: Dict[str, Any],
     ) -> Union[
@@ -245,8 +249,10 @@ def __call__(
         criteria, or custom scoring logic that considers multiple dimensions.

         Args:
-            item: The original API response object that was evaluated. Provides access
-                to the raw entity data if needed for composite scoring logic.
+            input: The input data that was provided to the system being evaluated.
+            output: The output generated by the system being evaluated.
+            expected_output: The expected/reference output for comparison (if available).
+            metadata: Additional metadata about the evaluation context.
             evaluations: List of evaluation results from item-level evaluators.
                 Each evaluation contains name, value, comment, and metadata.
@@ -266,7 +272,7 @@ def __call__(
         Examples:
             Simple weighted average:
             ```python
-            def weighted_composite(*, item, evaluations):
+            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
                 weights = {
                     "accuracy": 0.5,
                     "relevance": 0.3,
@@ -292,7 +298,7 @@ def weighted_composite(*, item, evaluations):

             Pass/fail composite based on thresholds:
             ```python
-            def pass_fail_composite(*, item, evaluations):
+            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
                 # Must pass all criteria
                 thresholds = {
                     "accuracy": 0.7,
@@ -320,13 +326,14 @@ def pass_fail_composite(*, item, evaluations):

             Async composite with external scoring:
             ```python
-            async def llm_composite(*, item, evaluations):
+            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
                 # Use LLM to synthesize multiple evaluation results
                 eval_summary = "\n".join(
                     f"- {e.name}: {e.value}" for e in evaluations
                 )

                 prompt = f"Given these evaluation scores:\n{eval_summary}\n"
+                prompt += f"For the output: {output}\n"
                 prompt += "Provide an overall quality score from 0-1."

                 response = await openai.chat.completions.create(
@@ -345,12 +352,12 @@ async def llm_composite(*, item, evaluations):

             Context-aware composite:
             ```python
-            def context_composite(*, item, evaluations):
-                # Adjust weighting based on item characteristics
+            def context_composite(*, input, output, expected_output, metadata, evaluations):
+                # Adjust weighting based on metadata
                 base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}

-                # If item has high importance, prioritize accuracy
-                if hasattr(item, 'metadata') and item.metadata.get('importance') == 'high':
+                # If metadata indicates high importance, prioritize accuracy
+                if metadata and metadata.get('importance') == 'high':
                     weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
                 else:
                     weights = base_weights
@@ -1211,7 +1218,10 @@ async def _process_batch_evaluation_item(
             try:
                 composite_evals = await self._run_composite_evaluator(
                     composite_evaluator,
-                    item=item,
+                    input=evaluator_inputs.input,
+                    output=evaluator_inputs.output,
+                    expected_output=evaluator_inputs.expected_output,
+                    metadata=evaluator_inputs.metadata,
                     evaluations=evaluations,
                 )
@@ -1289,14 +1299,20 @@ async def _run_mapper(
     async def _run_composite_evaluator(
         self,
         composite_evaluator: CompositeEvaluatorFunction,
-        item: Union[TraceWithFullDetails, ObservationsView],
+        input: Optional[Any],
+        output: Optional[Any],
+        expected_output: Optional[Any],
+        metadata: Optional[Dict[str, Any]],
         evaluations: List[Evaluation],
     ) -> List[Evaluation]:
         """Run composite evaluator function (handles both sync and async).

         Args:
             composite_evaluator: The composite evaluator function.
-            item: The original API response object.
+            input: The input data provided to the system.
+            output: The output generated by the system.
+            expected_output: The expected/reference output.
+            metadata: Additional metadata about the evaluation context.
             evaluations: List of item-level evaluations.

         Returns:
@@ -1305,7 +1321,13 @@ async def _run_composite_evaluator(
         Raises:
             Exception: If composite evaluator raises an exception.
         """
-        result = composite_evaluator(item=item, evaluations=evaluations)
+        result = composite_evaluator(
+            input=input,
+            output=output,
+            expected_output=expected_output,
+            metadata=metadata,
+            evaluations=evaluations,
+        )
         if asyncio.iscoroutine(result):
             result = await result
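
Putting the protocol change in concrete terms, here is a sketch of a composite evaluator that satisfies the updated `CompositeEvaluatorFunction` signature (keyword-only `input`, `output`, `expected_output`, `metadata`, `evaluations`, plus `**kwargs`). The score names and the 0.5 threshold are assumptions for illustration:

```python
# Sketch of an evaluator conforming to the updated CompositeEvaluatorFunction
# protocol; "accuracy"/"relevance" and the 0.5 threshold are made up, and the
# Evaluation import path is assumed.
from typing import Any, Dict, List, Optional

from langfuse import Evaluation  # assumed export path


def guarded_composite(
    *,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    expected_output: Optional[Any] = None,
    metadata: Optional[Dict[str, Any]] = None,
    evaluations: List[Evaluation],
    **kwargs: Any,
) -> Evaluation:
    # Ignore non-numeric scores instead of crashing; missing metrics count as 0.
    numeric = {e.name: e.value for e in evaluations if isinstance(e.value, (int, float))}
    passed = all(numeric.get(name, 0.0) >= 0.5 for name in ("accuracy", "relevance"))
    return Evaluation(
        name="guarded_composite",
        value=1.0 if passed else 0.0,
        comment=f"checked {sorted(numeric)} against a 0.5 threshold",
    )
```

Because both `run_experiment` and the batch evaluation path now call composite evaluators with the same keyword arguments, a single function like this can be reused in both places.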

tests/test_batch_evaluation.py

Lines changed: 5 additions & 5 deletions
@@ -466,7 +466,7 @@ def accuracy_evaluator(*, input, output, **kwargs):
     def relevance_evaluator(*, input, output, **kwargs):
         return Evaluation(name="relevance", value=0.9)

-    def composite_evaluator(*, item, evaluations):
+    def composite_evaluator(*, input, output, expected_output, metadata, evaluations):
         weights = {"accuracy": 0.6, "relevance": 0.4}
         total = sum(
             e.value * weights.get(e.name, 0)
@@ -503,7 +503,7 @@ def metric1_evaluator(*, input, output, **kwargs):
     def metric2_evaluator(*, input, output, **kwargs):
         return Evaluation(name="metric2", value=0.7)

-    def pass_fail_composite(*, item, evaluations):
+    def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
         thresholds = {"metric1": 0.8, "metric2": 0.6}

         passes = all(
@@ -536,7 +536,7 @@ async def test_async_composite_evaluator(sample_traces, langfuse_client):
     def evaluator1(*, input, output, **kwargs):
         return Evaluation(name="eval1", value=0.8)

-    async def async_composite(*, item, evaluations):
+    async def async_composite(*, input, output, expected_output, metadata, evaluations):
         await asyncio.sleep(0.01)  # Simulate async processing
         avg = sum(
             e.value for e in evaluations if isinstance(e.value, (int, float))
@@ -560,7 +560,7 @@ def test_composite_evaluator_with_no_evaluations(sample_traces, langfuse_client):
     def always_failing_evaluator(*, input, output, **kwargs):
         raise Exception("Always fails")

-    def composite_evaluator(*, item, evaluations):
+    def composite_evaluator(*, input, output, expected_output, metadata, evaluations):
         # Should not be called if no evaluations succeed
         return Evaluation(name="composite", value=0.0)

@@ -582,7 +582,7 @@ def test_composite_evaluator_failure_handling(sample_traces, langfuse_client):
     def evaluator1(*, input, output, **kwargs):
         return Evaluation(name="eval1", value=0.8)

-    def failing_composite(*, item, evaluations):
+    def failing_composite(*, input, output, expected_output, metadata, evaluations):
         raise ValueError("Composite evaluator failed")

     result = langfuse_client.run_batched_evaluation(
