@@ -218,7 +218,8 @@ class CompositeEvaluatorFunction(Protocol):
218218 composite assessments based on individual evaluation results.
219219
220220 Composite evaluators:
221- - Accept the original item and its list of evaluations
221+ - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
222+ plus the list of evaluations
222223 - Return either a single Evaluation, a list of Evaluations, or a dict
223224 - Can be either synchronous or asynchronous
224225 - Have access to both raw item data and evaluation results
@@ -227,7 +228,10 @@ class CompositeEvaluatorFunction(Protocol):
227228 def __call__ (
228229 self ,
229230 * ,
230- item : Union ["TraceWithFullDetails" , "ObservationsView" ],
231+ input : Optional [Any ] = None ,
232+ output : Optional [Any ] = None ,
233+ expected_output : Optional [Any ] = None ,
234+ metadata : Optional [Dict [str , Any ]] = None ,
231235 evaluations : List [Evaluation ],
232236 ** kwargs : Dict [str , Any ],
233237 ) -> Union [
@@ -245,8 +249,10 @@ def __call__(
245249 criteria, or custom scoring logic that considers multiple dimensions.
246250
247251 Args:
248- item: The original API response object that was evaluated. Provides access
249- to the raw entity data if needed for composite scoring logic.
252+ input: The input data that was provided to the system being evaluated.
253+ output: The output generated by the system being evaluated.
254+ expected_output: The expected/reference output for comparison (if available).
255+ metadata: Additional metadata about the evaluation context.
250256 evaluations: List of evaluation results from item-level evaluators.
251257 Each evaluation contains name, value, comment, and metadata.
252258
@@ -266,7 +272,7 @@ def __call__(
266272 Examples:
267273 Simple weighted average:
268274 ```python
269- def weighted_composite(*, item , evaluations):
269- def weighted_composite(*, item , evaluations):
275+ def weighted_composite(*, input, output, expected_output, metadata, evaluations):
270276 weights = {
271277 "accuracy": 0.5,
272278 "relevance": 0.3,
@@ -292,7 +298,7 @@ def weighted_composite(*, item, evaluations):
292298
293299 Pass/fail composite based on thresholds:
294300 ```python
295- def pass_fail_composite(*, item , evaluations):
301+ def pass_fail_composite(*, input, output, expected_output, metadata , evaluations):
296302 # Must pass all criteria
297303 thresholds = {
298304 "accuracy": 0.7,
@@ -320,13 +326,14 @@ def pass_fail_composite(*, item, evaluations):
320326
321327 Async composite with external scoring:
322328 ```python
323- async def llm_composite(*, item , evaluations):
329+ async def llm_composite(*, input, output, expected_output, metadata , evaluations):
324330 # Use LLM to synthesize multiple evaluation results
325331 eval_summary = "\n".join(
326332 f"- {e.name}: {e.value}" for e in evaluations
327333 )
328334
329335 prompt = f"Given these evaluation scores:\n{eval_summary}\n"
336+ prompt += f"For the output: {output}\n"
330337 prompt += "Provide an overall quality score from 0-1."
331338
332339 response = await openai.chat.completions.create(
@@ -345,12 +352,12 @@ async def llm_composite(*, item, evaluations):
345352
346353 Context-aware composite:
347354 ```python
348- def context_composite(*, item , evaluations):
349- # Adjust weighting based on item characteristics
355+ def context_composite(*, input, output, expected_output, metadata, evaluations):
356+ # Adjust weighting based on metadata
356+ # Adjust weighting based on metadata
350357 base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
351358
352- # If item has high importance, prioritize accuracy
353- if hasattr(item, 'metadata') and item.metadata.get('importance') == 'high':
359+ # If metadata indicates high importance, prioritize accuracy
360+ if metadata and metadata.get('importance') == 'high':
354361 weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
355362 else:
356363 weights = base_weights
@@ -1211,7 +1218,10 @@ async def _process_batch_evaluation_item(
12111218 try :
12121219 composite_evals = await self ._run_composite_evaluator (
12131220 composite_evaluator ,
1214- item = item ,
1221+ input = evaluator_inputs .input ,
1222+ output = evaluator_inputs .output ,
1223+ expected_output = evaluator_inputs .expected_output ,
1224+ metadata = evaluator_inputs .metadata ,
12151225 evaluations = evaluations ,
12161226 )
12171227
@@ -1289,14 +1299,20 @@ async def _run_mapper(
12891299 async def _run_composite_evaluator (
12901300 self ,
12911301 composite_evaluator : CompositeEvaluatorFunction ,
1292- item : Union [TraceWithFullDetails , ObservationsView ],
1302+ input : Optional [Any ],
1303+ output : Optional [Any ],
1304+ expected_output : Optional [Any ],
1305+ metadata : Optional [Dict [str , Any ]],
12931306 evaluations : List [Evaluation ],
12941307 ) -> List [Evaluation ]:
12951308 """Run composite evaluator function (handles both sync and async).
12961309
12971310 Args:
12981311 composite_evaluator: The composite evaluator function.
1299- item: The original API response object.
1312+ input: The input data provided to the system.
1313+ output: The output generated by the system.
1314+ expected_output: The expected/reference output.
1315+ metadata: Additional metadata about the evaluation context.
13001316 evaluations: List of item-level evaluations.
13011317
13021318 Returns:
@@ -1305,7 +1321,13 @@ async def _run_composite_evaluator(
13051321 Raises:
13061322 Exception: If composite evaluator raises an exception.
13071323 """
1308- result = composite_evaluator (item = item , evaluations = evaluations )
1324+ result = composite_evaluator (
1325+ input = input ,
1326+ output = output ,
1327+ expected_output = expected_output ,
1328+ metadata = metadata ,
1329+ evaluations = evaluations ,
1330+ )
13091331 if asyncio .iscoroutine (result ):
13101332 result = await result
13111333
0 commit comments