Skip to content

Commit ce290f5

Browse files
committed
expand tests
1 parent f5f2cac commit ce290f5

1 file changed

Lines changed: 149 additions & 1 deletion

File tree

tests/test_experiments.py

Lines changed: 149 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -96,6 +96,48 @@ def test_run_experiment_on_local_dataset(sample_dataset):
9696
langfuse_client.flush()
9797
time.sleep(2)
9898

99+
# Validate traces are correctly persisted with input/output/metadata
100+
api = get_api()
101+
expected_inputs = ["Germany", "France", "Spain"]
102+
expected_outputs = ["Capital of Germany", "Capital of France", "Capital of Spain"]
103+
104+
for i, item_result in enumerate(result["item_results"]):
105+
trace_id = item_result["trace_id"]
106+
assert trace_id is not None, f"Item {i} should have a trace_id"
107+
108+
# Fetch trace from API
109+
trace = api.trace.get(trace_id)
110+
assert trace is not None, f"Trace {trace_id} should exist"
111+
112+
# Validate trace name
113+
assert (
114+
trace.name == "experiment-item-run"
115+
), f"Trace {trace_id} should have correct name"
116+
117+
# Validate trace input - should contain the experiment item
118+
assert trace.input is not None, f"Trace {trace_id} should have input"
119+
expected_input = expected_inputs[i]
120+
# The input should contain the item data in some form
121+
assert expected_input in str(
122+
trace.input
123+
), f"Trace {trace_id} input should contain '{expected_input}'"
124+
125+
# Validate trace output - should be the task result
126+
assert trace.output is not None, f"Trace {trace_id} should have output"
127+
expected_output = expected_outputs[i]
128+
assert (
129+
trace.output == expected_output
130+
), f"Trace {trace_id} output should be '{expected_output}', got '{trace.output}'"
131+
132+
# Validate trace metadata contains experiment name
133+
assert trace.metadata is not None, f"Trace {trace_id} should have metadata"
134+
assert (
135+
"experiment_name" in trace.metadata
136+
), f"Trace {trace_id} metadata should contain experiment_name"
137+
assert (
138+
trace.metadata["experiment_name"] == "Euro capitals"
139+
), f"Trace {trace_id} metadata should have correct experiment_name"
140+
99141

100142
def test_run_experiment_on_langfuse_dataset():
101143
"""Test running experiment on Langfuse dataset."""
@@ -120,8 +162,10 @@ def test_run_experiment_on_langfuse_dataset():
120162
# Get dataset and run experiment
121163
dataset = langfuse_client.get_dataset(dataset_name)
122164

165+
# Use unique experiment name for proper identification
166+
experiment_name = "Dataset Test " + create_uuid()[:8]
123167
result = dataset.run_experiment(
124-
name="Dataset Test",
168+
name=experiment_name,
125169
description="Test on Langfuse dataset",
126170
task=mock_task,
127171
evaluators=[factuality_evaluator],
@@ -142,6 +186,110 @@ def test_run_experiment_on_langfuse_dataset():
142186
runs = api.datasets.get_runs(dataset_name)
143187
assert len(runs.data) >= 1
144188

189+
# Validate traces are correctly persisted with input/output/metadata
190+
expected_data = {"Germany": "Capital of Germany", "France": "Capital of France"}
191+
dataset_run_id = result["dataset_run_id"]
192+
193+
# Create a mapping from dataset item ID to dataset item for validation
194+
dataset_item_map = {item.id: item for item in dataset.items}
195+
196+
for i, item_result in enumerate(result["item_results"]):
197+
trace_id = item_result["trace_id"]
198+
assert trace_id is not None, f"Item {i} should have a trace_id"
199+
200+
# Fetch trace from API
201+
trace = api.trace.get(trace_id)
202+
assert trace is not None, f"Trace {trace_id} should exist"
203+
204+
# Validate trace name
205+
assert (
206+
trace.name == "experiment-item-run"
207+
), f"Trace {trace_id} should have correct name"
208+
209+
# Validate trace input and output match expected pairs
210+
assert trace.input is not None, f"Trace {trace_id} should have input"
211+
trace_input_str = str(trace.input)
212+
213+
# Find which expected input this trace corresponds to
214+
matching_input = None
215+
for expected_input in expected_data.keys():
216+
if expected_input in trace_input_str:
217+
matching_input = expected_input
218+
break
219+
220+
assert (
221+
matching_input is not None
222+
), f"Trace {trace_id} input '{trace_input_str}' should contain one of {list(expected_data.keys())}"
223+
224+
# Validate trace output matches the expected output for this input
225+
assert trace.output is not None, f"Trace {trace_id} should have output"
226+
expected_output = expected_data[matching_input]
227+
assert (
228+
trace.output == expected_output
229+
), f"Trace {trace_id} output should be '{expected_output}', got '{trace.output}'"
230+
231+
# Validate trace metadata contains experiment and dataset info
232+
assert trace.metadata is not None, f"Trace {trace_id} should have metadata"
233+
assert (
234+
"experiment_name" in trace.metadata
235+
), f"Trace {trace_id} metadata should contain experiment_name"
236+
assert (
237+
trace.metadata["experiment_name"] == experiment_name
238+
), f"Trace {trace_id} metadata should have correct experiment_name"
239+
240+
# Validate dataset-specific metadata fields
241+
assert (
242+
"dataset_id" in trace.metadata
243+
), f"Trace {trace_id} metadata should contain dataset_id"
244+
assert (
245+
trace.metadata["dataset_id"] == dataset.id
246+
), f"Trace {trace_id} metadata should have correct dataset_id"
247+
248+
assert (
249+
"dataset_item_id" in trace.metadata
250+
), f"Trace {trace_id} metadata should contain dataset_item_id"
251+
# Get the dataset item ID from metadata and validate it exists
252+
dataset_item_id = trace.metadata["dataset_item_id"]
253+
assert (
254+
dataset_item_id in dataset_item_map
255+
), f"Trace {trace_id} metadata dataset_item_id should correspond to a valid dataset item"
256+
257+
# Validate the dataset item input matches the trace input
258+
dataset_item = dataset_item_map[dataset_item_id]
259+
assert (
260+
dataset_item.input == matching_input
261+
), f"Trace {trace_id} should correspond to dataset item with input '{matching_input}'"
262+
263+
# Verify the dataset run exists and has the expected name and description
264+
dataset_run = None
265+
for run in runs.data:
266+
if run.id == dataset_run_id:
267+
dataset_run = run
268+
break
269+
270+
assert dataset_run is not None, f"Dataset run {dataset_run_id} should exist"
271+
assert dataset_run.name == experiment_name, "Dataset run should have correct name"
272+
assert (
273+
dataset_run.description == "Test on Langfuse dataset"
274+
), "Dataset run should have correct description"
275+
276+
# Get dataset run items to verify trace linkage
277+
dataset_run_items = api.dataset_run_items.list(
278+
dataset_id=dataset.id, run_name=experiment_name
279+
)
280+
assert len(dataset_run_items.data) == 2, "Dataset run should have 2 items"
281+
282+
# Verify the set of trace IDs linked by the run items equals the set of trace IDs in the experiment results
283+
run_item_trace_ids = {
284+
item.trace_id for item in dataset_run_items.data if item.trace_id
285+
}
286+
result_trace_ids = {item["trace_id"] for item in result["item_results"]}
287+
288+
assert run_item_trace_ids == result_trace_ids, (
289+
f"Dataset run items should link to the same traces as experiment results. "
290+
f"Run items: {run_item_trace_ids}, Results: {result_trace_ids}"
291+
)
292+
145293

146294
# Error Handling Tests
147295
def test_evaluator_failures_handled_gracefully():

0 commit comments

Comments
 (0)