77
88import asyncio
99import logging
10- from dataclasses import dataclass
1110from typing import (
1211 TYPE_CHECKING ,
1312 Any ,
@@ -94,13 +93,11 @@ class LocalExperimentItem(TypedDict, total=False):
9493"""
9594
9695
97- @dataclass (frozen = True )
9896class Evaluation :
99- """Represents an evaluation result for an experiment item.
97+ """Represents an evaluation result for an experiment item or an entire experiment run .
10098
10199 This class provides a strongly-typed way to create evaluation results in evaluator functions.
102- Users should import this class and return instances instead of dictionaries for better
103- type safety and IDE support.
100+ Users must use keyword arguments when instantiating this class.
104101
105102 Attributes:
106103 name: Unique identifier for the evaluation metric. Should be descriptive
@@ -117,7 +114,7 @@ class Evaluation:
117114 metadata: Optional structured metadata about the evaluation process.
118115 Can include confidence scores, intermediate calculations, model versions,
119116 or any other relevant technical details.
120- data_type: Optional score data type; one of NUMERIC, CATEGORICAL, or BOOLEAN; default: NUMERIC
117+ data_type: Optional score data type; one of NUMERIC, CATEGORICAL, or BOOLEAN (default: NUMERIC). Must be set explicitly when the value is not numeric.
121118 config_id: Optional Langfuse score config id
122119
123120 Examples:
@@ -180,25 +177,47 @@ def external_api_evaluator(*, input, output, **kwargs):
180177 ```
181178
182179 Note:
183- This class is immutable (frozen=True) to ensure evaluation results cannot be
184- accidentally modified after creation. All fields except name and value are optional .
180+ All arguments must be passed as keywords. Positional arguments are not allowed
181+ to ensure code clarity and prevent errors from argument reordering .
185182 """
186183
187- name : str
188- value : Union [int , float , str , bool , None ]
189- comment : Optional [str ] = None
190- metadata : Optional [Dict [str , Any ]] = None
191- data_type : Optional [ScoreDataType ] = None
192- config_id : Optional [str ] = None
def __init__(
    self,
    *,
    name: str,
    value: Union[int, float, str, bool, None],
    comment: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    data_type: Optional[ScoreDataType] = None,
    config_id: Optional[str] = None,
):
    """Store the supplied evaluation fields on the instance.

    Every parameter is keyword-only (enforced by the bare ``*`` in the
    signature), so calling this with positional arguments raises a
    ``TypeError``. The values are stored as given; no validation or
    conversion is performed here.

    Args:
        name: Unique identifier for the evaluation metric.
        value: The evaluation score or result.
        comment: Optional human-readable explanation of the result.
        metadata: Optional structured metadata about the evaluation process.
        data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
        config_id: Optional Langfuse score config id.
    """
    # Mandatory fields: which metric this is and what it scored.
    self.name, self.value = name, value
    # Optional context; each of these falls back to None when omitted.
    self.comment, self.metadata = comment, metadata
    self.data_type, self.config_id = data_type, config_id
193213
194214
195- @dataclass (frozen = True )
196215class ExperimentItemResult :
197216 """Result structure for individual experiment items.
198217
199- This dataclass represents the complete result of processing a single item
218+ This class represents the complete result of processing a single item
200219 during an experiment run, including the original input, task output,
201- evaluations, and tracing information.
220+ evaluations, and tracing information. Users must use keyword arguments when instantiating this class.
202221
203222 Attributes:
204223 item: The original experiment item that was processed. Can be either
@@ -239,13 +258,38 @@ class ExperimentItemResult:
239258 input_data = item_result.item.input
240259 expected = item_result.item.expected_output
241260 ```
261+
262+ Note:
263+ All arguments must be passed as keywords. Positional arguments are not allowed
264+ to ensure code clarity and prevent errors from argument reordering.
242265 """
243266
244- item : ExperimentItem
245- output : Any
246- evaluations : List [Evaluation ]
247- trace_id : Optional [str ]
248- dataset_run_id : Optional [str ]
def __init__(
    self,
    *,
    item: ExperimentItem,
    output: Any,
    evaluations: List[Evaluation],
    trace_id: Optional[str],
    dataset_run_id: Optional[str],
):
    """Store the result of processing one experiment item.

    Every parameter is keyword-only (enforced by the bare ``*`` in the
    signature) and none has a default, so all five must be supplied by
    name; a positional call raises a ``TypeError``. The values are stored
    as given.

    Args:
        item: The original experiment item that was processed.
        output: The actual output produced by the task function for this item.
        evaluations: List of evaluation results for this item.
        trace_id: Optional Langfuse trace ID for this item's execution.
        dataset_run_id: Optional dataset run ID if this item was part of a
            Langfuse dataset.
    """
    # What was processed and what the task produced for it.
    self.item, self.output = item, output
    self.evaluations = evaluations
    # Tracing identifiers; either may be None.
    self.trace_id, self.dataset_run_id = trace_id, dataset_run_id
249293
250294
251295class ExperimentResult :
@@ -314,6 +358,7 @@ class ExperimentResult:
314358
315359 def __init__ (
316360 self ,
361+ * ,
317362 name : str ,
318363 description : Optional [str ],
319364 item_results : List [ExperimentItemResult ],
@@ -938,7 +983,7 @@ async def _run_evaluator(
938983
939984 # Normalize to list
940985 if isinstance (result , (dict , Evaluation )):
941- return [result ]
986+ return [result ] # type: ignore
942987
943988 elif isinstance (result , list ):
944989 return result
0 commit comments