@@ -96,6 +96,48 @@ def test_run_experiment_on_local_dataset(sample_dataset):
9696 langfuse_client .flush ()
9797 time .sleep (2 )
9898
99+ # Validate traces are correctly persisted with input/output/metadata
100+ api = get_api ()
101+ expected_inputs = ["Germany" , "France" , "Spain" ]
102+ expected_outputs = ["Capital of Germany" , "Capital of France" , "Capital of Spain" ]
103+
104+ for i , item_result in enumerate (result ["item_results" ]):
105+ trace_id = item_result ["trace_id" ]
106+ assert trace_id is not None , f"Item { i } should have a trace_id"
107+
108+ # Fetch trace from API
109+ trace = api .trace .get (trace_id )
110+ assert trace is not None , f"Trace { trace_id } should exist"
111+
112+ # Validate trace name
113+ assert (
114+ trace .name == "experiment-item-run"
115+ ), f"Trace { trace_id } should have correct name"
116+
117+ # Validate trace input - should contain the experiment item
118+ assert trace .input is not None , f"Trace { trace_id } should have input"
119+ expected_input = expected_inputs [i ]
120+ # The input should contain the item data in some form
121+ assert expected_input in str (
122+ trace .input
123+ ), f"Trace { trace_id } input should contain '{ expected_input } '"
124+
125+ # Validate trace output - should be the task result
126+ assert trace .output is not None , f"Trace { trace_id } should have output"
127+ expected_output = expected_outputs [i ]
128+ assert (
129+ trace .output == expected_output
130+ ), f"Trace { trace_id } output should be '{ expected_output } ', got '{ trace .output } '"
131+
132+ # Validate trace metadata contains experiment name
133+ assert trace .metadata is not None , f"Trace { trace_id } should have metadata"
134+ assert (
135+ "experiment_name" in trace .metadata
136+ ), f"Trace { trace_id } metadata should contain experiment_name"
137+ assert (
138+ trace .metadata ["experiment_name" ] == "Euro capitals"
139+ ), f"Trace { trace_id } metadata should have correct experiment_name"
140+
99141
100142def test_run_experiment_on_langfuse_dataset ():
101143 """Test running experiment on Langfuse dataset."""
@@ -120,8 +162,10 @@ def test_run_experiment_on_langfuse_dataset():
120162 # Get dataset and run experiment
121163 dataset = langfuse_client .get_dataset (dataset_name )
122164
165+ # Use unique experiment name for proper identification
166+ experiment_name = "Dataset Test " + create_uuid ()[:8 ]
123167 result = dataset .run_experiment (
124- name = "Dataset Test" ,
168+ name = experiment_name ,
125169 description = "Test on Langfuse dataset" ,
126170 task = mock_task ,
127171 evaluators = [factuality_evaluator ],
@@ -142,6 +186,110 @@ def test_run_experiment_on_langfuse_dataset():
142186 runs = api .datasets .get_runs (dataset_name )
143187 assert len (runs .data ) >= 1
144188
189+ # Validate traces are correctly persisted with input/output/metadata
190+ expected_data = {"Germany" : "Capital of Germany" , "France" : "Capital of France" }
191+ dataset_run_id = result ["dataset_run_id" ]
192+
193+ # Create a mapping from dataset item ID to dataset item for validation
194+ dataset_item_map = {item .id : item for item in dataset .items }
195+
196+ for i , item_result in enumerate (result ["item_results" ]):
197+ trace_id = item_result ["trace_id" ]
198+ assert trace_id is not None , f"Item { i } should have a trace_id"
199+
200+ # Fetch trace from API
201+ trace = api .trace .get (trace_id )
202+ assert trace is not None , f"Trace { trace_id } should exist"
203+
204+ # Validate trace name
205+ assert (
206+ trace .name == "experiment-item-run"
207+ ), f"Trace { trace_id } should have correct name"
208+
209+ # Validate trace input and output match expected pairs
210+ assert trace .input is not None , f"Trace { trace_id } should have input"
211+ trace_input_str = str (trace .input )
212+
213+ # Find which expected input this trace corresponds to
214+ matching_input = None
215+ for expected_input in expected_data .keys ():
216+ if expected_input in trace_input_str :
217+ matching_input = expected_input
218+ break
219+
220+ assert (
221+ matching_input is not None
222+ ), f"Trace { trace_id } input '{ trace_input_str } ' should contain one of { list (expected_data .keys ())} "
223+
224+ # Validate trace output matches the expected output for this input
225+ assert trace .output is not None , f"Trace { trace_id } should have output"
226+ expected_output = expected_data [matching_input ]
227+ assert (
228+ trace .output == expected_output
229+ ), f"Trace { trace_id } output should be '{ expected_output } ', got '{ trace .output } '"
230+
231+ # Validate trace metadata contains experiment and dataset info
232+ assert trace .metadata is not None , f"Trace { trace_id } should have metadata"
233+ assert (
234+ "experiment_name" in trace .metadata
235+ ), f"Trace { trace_id } metadata should contain experiment_name"
236+ assert (
237+ trace .metadata ["experiment_name" ] == experiment_name
238+ ), f"Trace { trace_id } metadata should have correct experiment_name"
239+
240+ # Validate dataset-specific metadata fields
241+ assert (
242+ "dataset_id" in trace .metadata
243+ ), f"Trace { trace_id } metadata should contain dataset_id"
244+ assert (
245+ trace .metadata ["dataset_id" ] == dataset .id
246+ ), f"Trace { trace_id } metadata should have correct dataset_id"
247+
248+ assert (
249+ "dataset_item_id" in trace .metadata
250+ ), f"Trace { trace_id } metadata should contain dataset_item_id"
251+ # Get the dataset item ID from metadata and validate it exists
252+ dataset_item_id = trace .metadata ["dataset_item_id" ]
253+ assert (
254+ dataset_item_id in dataset_item_map
255+ ), f"Trace { trace_id } metadata dataset_item_id should correspond to a valid dataset item"
256+
257+ # Validate the dataset item input matches the trace input
258+ dataset_item = dataset_item_map [dataset_item_id ]
259+ assert (
260+ dataset_item .input == matching_input
261+ ), f"Trace { trace_id } should correspond to dataset item with input '{ matching_input } '"
262+
263+ # Locate the dataset run created by this experiment and verify its name and description
264+ dataset_run = None
265+ for run in runs .data :
266+ if run .id == dataset_run_id :
267+ dataset_run = run
268+ break
269+
270+ assert dataset_run is not None , f"Dataset run { dataset_run_id } should exist"
271+ assert dataset_run .name == experiment_name , "Dataset run should have correct name"
272+ assert (
273+ dataset_run .description == "Test on Langfuse dataset"
274+ ), "Dataset run should have correct description"
275+
276+ # Get dataset run items to verify trace linkage
277+ dataset_run_items = api .dataset_run_items .list (
278+ dataset_id = dataset .id , run_name = experiment_name
279+ )
280+ assert len (dataset_run_items .data ) == 2 , "Dataset run should have 2 items"
281+
282+ # Verify each dataset run item links to the correct trace
283+ run_item_trace_ids = {
284+ item .trace_id for item in dataset_run_items .data if item .trace_id
285+ }
286+ result_trace_ids = {item ["trace_id" ] for item in result ["item_results" ]}
287+
288+ assert run_item_trace_ids == result_trace_ids , (
289+ f"Dataset run items should link to the same traces as experiment results. "
290+ f"Run items: { run_item_trace_ids } , Results: { result_trace_ids } "
291+ )
292+
145293
146294# Error Handling Tests
147295def test_evaluator_failures_handled_gracefully ():
0 commit comments