add run_name

hassiebp · hassiebp · commit 32cbe0255e8c · 2025-09-16T23:05:13.000+02:00
diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py
@@ -2463,6 +2463,7 @@ def run_experiment(
         self,
         *,
         name: str,
+        run_name: Optional[str] = None,
         description: Optional[str] = None,
         data: ExperimentData,
         task: TaskFunction,
@@ -2487,7 +2488,10 @@ def run_experiment(
 
         Args:
             name: Human-readable name for the experiment. Used for identification
-                in the Langfuse UI and for dataset run naming if using Langfuse datasets.
+                in the Langfuse UI.
+            run_name: Optional exact name for the experiment run. When provided and `data`
+                contains Langfuse dataset items, it is used verbatim as the dataset run name.
+                If not provided, it defaults to "<name> - <ISO timestamp>".
             description: Optional description explaining the experiment's purpose,
                 methodology, or expected outcomes.
             data: Array of data items to process. Can be either:
@@ -2628,6 +2632,9 @@ def average_accuracy(*, item_results, **kwargs):
             run_async_safely(
                 self._run_experiment_async(
                     name=name,
+                    run_name=self._create_experiment_run_name(
+                        name=name, run_name=run_name
+                    ),
                     description=description,
                     data=data,
                     task=task,
@@ -2643,6 +2650,7 @@ async def _run_experiment_async(
         self,
         *,
         name: str,
+        run_name: str,
         description: Optional[str],
         data: ExperimentData,
         task: TaskFunction,
@@ -2651,7 +2659,9 @@ async def _run_experiment_async(
         max_concurrency: int,
         metadata: Dict[str, Any],
     ) -> ExperimentResult:
-        langfuse_logger.debug(f"Starting experiment '{name}' with {len(data)} items")
+        langfuse_logger.debug(
+            f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
+        )
 
         # Set up concurrency control
         semaphore = asyncio.Semaphore(max_concurrency)
@@ -2660,7 +2670,7 @@ async def _run_experiment_async(
         async def process_item(item: ExperimentItem) -> ExperimentItemResult:
             async with semaphore:
                 return await self._process_experiment_item(
-                    item, task, evaluators, name, description, metadata
+                    item, task, evaluators, name, run_name, description, metadata
                 )
 
         # Run all items concurrently
@@ -2728,6 +2738,7 @@ async def process_item(item: ExperimentItem) -> ExperimentItemResult:
 
         return ExperimentResult(
             name=name,
+            run_name=run_name,
             description=description,
             item_results=valid_results,
             run_evaluations=run_evaluations,
@@ -2741,6 +2752,7 @@ async def _process_experiment_item(
         task: Callable,
         evaluators: List[Callable],
         experiment_name: str,
+        experiment_run_name: str,
         experiment_description: Optional[str],
         experiment_metadata: Dict[str, Any],
     ) -> ExperimentItemResult:
@@ -2764,6 +2776,7 @@ async def _process_experiment_item(
 
                 final_metadata = {
                     "experiment_name": experiment_name,
+                    "experiment_run_name": experiment_run_name,
                     **experiment_metadata,
                 }
 
@@ -2796,7 +2809,7 @@ async def _process_experiment_item(
 
                         dataset_run_item = self.api.dataset_run_items.create(
                             request=CreateDatasetRunItemRequest(
-                                runName=experiment_name,
+                                runName=experiment_run_name,
                                 runDescription=experiment_description,
                                 metadata=experiment_metadata,
                                 datasetItemId=item.id,  # type: ignore
@@ -2864,6 +2877,16 @@ async def _process_experiment_item(
                 )
                 raise e
 
+    def _create_experiment_run_name(
+        self, *, name: Optional[str] = None, run_name: Optional[str] = None
+    ) -> str:
+        if run_name:  # A non-empty caller-supplied run name is returned verbatim.
+            return run_name
+        # Falsy run_name (None or ""): build a default; a "+00:00" offset suffix is rewritten as "Z".
+        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")
+        # NOTE(review): `name` defaults to None, which would render as "None - <timestamp>" — confirm callers always pass it.
+        return f"{name} - {iso_timestamp}"
+
     def auth_check(self) -> bool:
         """Check if the provided credentials (public and secret key) are valid.
 
diff --git a/langfuse/_client/datasets.py b/langfuse/_client/datasets.py
@@ -6,6 +6,7 @@
 
 from langfuse.experiment import (
     EvaluatorFunction,
+    ExperimentResult,
     RunEvaluatorFunction,
     TaskFunction,
 )
@@ -199,13 +200,14 @@ def run_experiment(
         self,
         *,
         name: str,
+        run_name: Optional[str] = None,
         description: Optional[str] = None,
         task: TaskFunction,
         evaluators: List[EvaluatorFunction] = [],
         run_evaluators: List[RunEvaluatorFunction] = [],
         max_concurrency: int = 50,
         metadata: Optional[Dict[str, Any]] = None,
-    ) -> Any:
+    ) -> ExperimentResult:
         """Run an experiment on this Langfuse dataset with automatic tracking.
 
         This is a convenience method that runs an experiment using all items in this
@@ -222,6 +224,9 @@ def run_experiment(
         Args:
             name: Human-readable name for the experiment run. This will be used as
                 the dataset run name in Langfuse for tracking and identification.
+            run_name: Optional exact name for the dataset run. If provided, it is used
+                verbatim as the dataset run name in Langfuse. If not provided, it defaults
+                to "<name> - <ISO timestamp>".
             description: Optional description of the experiment's purpose, methodology,
                 or what you're testing. Appears in the Langfuse UI for context.
             task: Function that processes each dataset item and returns output.
@@ -238,12 +243,13 @@ def run_experiment(
 
         Returns:
             ExperimentResult object containing:
-            - name: The experiment name
-            - description: Optional experiment description
-            - item_results: Results for each dataset item with outputs and evaluations
-            - run_evaluations: Aggregate evaluation results for the entire run
-            - dataset_run_id: ID of the created dataset run in Langfuse
-            - dataset_run_url: Direct URL to view the experiment results in Langfuse UI
+            - name: The experiment name.
+            - run_name: The experiment run name (equivalent to the dataset run name).
+            - description: Optional experiment description.
+            - item_results: Results for each dataset item with outputs and evaluations.
+            - run_evaluations: Aggregate evaluation results for the entire run.
+            - dataset_run_id: ID of the created dataset run in Langfuse.
+            - dataset_run_url: Direct URL to view the experiment results in Langfuse UI.
 
             The result object provides a format() method for human-readable output:
             ```python
@@ -253,7 +259,7 @@ def run_experiment(
             ```
 
         Raises:
-            ValueError: If the dataset has no items or no Langfuse client is available
+            ValueError: If the dataset has no items or no Langfuse client is available.
 
         Examples:
             Basic dataset experiment:
@@ -400,6 +406,7 @@ def content_diversity(*, item_results, **kwargs):
 
         return langfuse_client.run_experiment(
             name=name,
+            run_name=run_name,
             description=description,
             data=self.items,
             task=task,
diff --git a/langfuse/_client/span.py b/langfuse/_client/span.py
@@ -1468,19 +1468,19 @@ def start_as_current_generation(
         return self.start_as_current_observation(
             name=name,
             as_type="generation",
-                input=input,
-                output=output,
-                metadata=metadata,
-                version=version,
-                level=level,
-                status_message=status_message,
-                completion_start_time=completion_start_time,
-                model=model,
-                model_parameters=model_parameters,
-                usage_details=usage_details,
-                cost_details=cost_details,
-                prompt=prompt,
-            )
+            input=input,
+            output=output,
+            metadata=metadata,
+            version=version,
+            level=level,
+            status_message=status_message,
+            completion_start_time=completion_start_time,
+            model=model,
+            model_parameters=model_parameters,
+            usage_details=usage_details,
+            cost_details=cost_details,
+            prompt=prompt,
+        )
 
     def create_event(
         self,
diff --git a/langfuse/experiment.py b/langfuse/experiment.py
@@ -114,8 +114,9 @@ class Evaluation:
         metadata: Optional structured metadata about the evaluation process.
             Can include confidence scores, intermediate calculations, model versions,
             or any other relevant technical details.
-        data_type: Optional score data type, required if value is not NUMERIC; one of NUMERIC, CATEGORICAL, or BOOLEAN; default: NUMERIC
-        config_id: Optional Langfuse score config id
+        data_type: Optional score data type. Required if value is not NUMERIC.
+            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
+        config_id: Optional Langfuse score config ID.
 
     Examples:
         Basic accuracy evaluation:
@@ -194,12 +195,12 @@ def __init__(
         """Initialize an Evaluation with the provided data.
 
         Args:
-            name: Unique identifier for the evaluation metric
-            value: The evaluation score or result
-            comment: Optional human-readable explanation of the result
-            metadata: Optional structured metadata about the evaluation process
-            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN)
-            config_id: Optional Langfuse score config id
+            name: Unique identifier for the evaluation metric.
+            value: The evaluation score or result.
+            comment: Optional human-readable explanation of the result.
+            metadata: Optional structured metadata about the evaluation process.
+            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
+            config_id: Optional Langfuse score config ID.
 
         Note:
             All arguments must be provided as keywords. Positional arguments will raise a TypeError.
@@ -276,11 +277,11 @@ def __init__(
         """Initialize an ExperimentItemResult with the provided data.
 
         Args:
-            item: The original experiment item that was processed
-            output: The actual output produced by the task function for this item
-            evaluations: List of evaluation results for this item
-            trace_id: Optional Langfuse trace ID for this item's execution
-            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset
+            item: The original experiment item that was processed.
+            output: The actual output produced by the task function for this item.
+            evaluations: List of evaluation results for this item.
+            trace_id: Optional Langfuse trace ID for this item's execution.
+            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
 
         Note:
             All arguments must be provided as keywords. Positional arguments will raise a TypeError.
@@ -300,14 +301,15 @@ class ExperimentResult:
     about the experiment execution.
 
     Attributes:
-        name: The name of the experiment as specified during execution
-        description: Optional description of the experiment's purpose or methodology
+        name: The name of the experiment as specified during execution.
+        run_name: The name of the current experiment run.
+        description: Optional description of the experiment's purpose or methodology.
         item_results: List of results from processing each individual dataset item,
-            containing the original item, task output, evaluations, and trace information
+            containing the original item, task output, evaluations, and trace information.
         run_evaluations: List of aggregate evaluation results computed across all items,
-            such as average scores, statistical summaries, or cross-item analyses
-        dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets)
-        dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI
+            such as average scores, statistical summaries, or cross-item analyses.
+        dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
+        dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
 
     Examples:
         Basic usage with local dataset:
@@ -360,6 +362,7 @@ def __init__(
         self,
         *,
         name: str,
+        run_name: str,
         description: Optional[str],
         item_results: List[ExperimentItemResult],
         run_evaluations: List[Evaluation],
@@ -369,14 +372,16 @@ def __init__(
         """Initialize an ExperimentResult with the provided data.
 
         Args:
-            name: The name of the experiment
-            description: Optional description of the experiment
-            item_results: List of results from processing individual dataset items
-            run_evaluations: List of aggregate evaluation results for the entire run
-            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets)
-            dataset_run_url: Optional URL to view results in Langfuse UI
+            name: The name of the experiment.
+            run_name: The current experiment run name.
+            description: Optional description of the experiment.
+            item_results: List of results from processing individual dataset items.
+            run_evaluations: List of aggregate evaluation results for the entire run.
+            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
+            dataset_run_url: Optional URL to view results in Langfuse UI.
         """
         self.name = name
+        self.run_name = run_name
         self.description = description
         self.item_results = item_results
         self.run_evaluations = run_evaluations
@@ -526,7 +531,8 @@ def format(self, *, include_item_results: bool = False) -> str:
 
         # Experiment overview section
         output += f"\\n{'─' * 50}\\n"
-        output += f"📊 {self.name}"
+        output += f"🧪 Experiment: {self.name}"
+        output += f"\n📋 Run name: {self.run_name}"
         if self.description:
             output += f" - {self.description}"
 
diff --git a/tests/test_experiments.py b/tests/test_experiments.py