Merge remote-tracking branch 'origin/main' into codex/split-test-suites-by-directory

hassiebp · hassiebp · commit 82bfb880a1f2 · 2026-04-04T17:07:30.000+02:00
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -21,3 +21,17 @@ updates:
       llama-index:
         patterns:
           - "llama-index*"
+
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "daily"
+    rebase-strategy: "disabled"
+    commit-message:
+      prefix: chore
+      prefix-development: chore
+      include: scope
+    groups:
+      github-actions:
+        patterns:
+          - "*"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -18,9 +18,9 @@ jobs:
   linting:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
       - name: Install uv and set Python version
-        uses: astral-sh/setup-uv@v7
+        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8
         with:
           version: "0.11.2"
           python-version: "3.13"
@@ -33,14 +33,14 @@ jobs:
   type-checking:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
       - name: Install uv and set Python version
-        uses: astral-sh/setup-uv@v7
+        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8
         with:
           version: "0.11.2"
           python-version: "3.13"
           enable-cache: true
-      - uses: actions/cache@v3
+      - uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5
         name: Cache mypy cache
         with:
           path: ./.mypy_cache
@@ -105,8 +105,8 @@ jobs:
 
     name: E2E tests on Python 3.13
     steps:
-      - uses: actions/checkout@v3
-      - uses: pnpm/action-setup@v3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      - uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5
         with:
           version: 10.33.0
 
@@ -115,12 +115,12 @@ jobs:
           git clone https://github.com/langfuse/langfuse.git ./langfuse-server && echo $(cd ./langfuse-server && git rev-parse HEAD)
 
       - name: Setup node (for langfuse server)
-        uses: actions/setup-node@v3
+        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
         with:
           node-version: 24
 
       - name: Cache langfuse server dependencies
-        uses: actions/cache@v3
+        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5
         with:
           path: ./langfuse-server/node_modules
           key: |
@@ -184,7 +184,7 @@ jobs:
           echo "Langfuse server is up and running!"
 
       - name: Install uv and set Python version
-        uses: astral-sh/setup-uv@v7
+        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8
         with:
           version: "0.11.2"
           python-version: "3.13"
diff --git a/.github/workflows/claude-review-maintainer-prs.yml b/.github/workflows/claude-review-maintainer-prs.yml
@@ -16,7 +16,7 @@ jobs:
     steps:
       - name: Check author permission and existing review request
         id: check
-        uses: actions/github-script@v7
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
         with:
           script: |
             const owner = context.repo.owner;
@@ -57,7 +57,7 @@ jobs:
 
       - name: Add Claude review comment
         if: steps.check.outputs.should_comment == 'true'
-        uses: actions/github-script@v7
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
         with:
           script: |
             await github.rest.issues.createComment({
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
@@ -55,11 +55,11 @@ jobs:
         # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v4
+      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@v3
+      uses: github/codeql-action/init@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1
       with:
         languages: ${{ matrix.language }}
         build-mode: ${{ matrix.build-mode }}
@@ -87,6 +87,6 @@ jobs:
         exit 1
 
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v3
+      uses: github/codeql-action/analyze@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1
       with:
         category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/dependabot-merge.yml b/.github/workflows/dependabot-merge.yml
@@ -15,7 +15,7 @@ jobs:
     steps:
       - name: Dependabot metadata
         id: metadata
-        uses: dependabot/fetch-metadata@v1
+        uses: dependabot/fetch-metadata@ffa630c65fa7e0ecfa0625b5ceda64399aea1b36 # v3
         with:
           github-token: "${{ secrets.GITHUB_TOKEN }}"
       - name: Enable auto-merge for Dependabot PRs
diff --git a/.github/workflows/dependabot-rebase-stale.yml b/.github/workflows/dependabot-rebase-stale.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Rebase open Dependabot PR"
-        uses: orange-buffalo/dependabot-auto-rebase@v1
+        uses: orange-buffalo/dependabot-auto-rebase@fa9e05d7a8152381af0a92ffca942a0d46712544 # v1
         with:
           api-token: ${{ secrets.DEP_REBASE_PAT }}
           repository: ${{ github.repository }}
diff --git a/.github/workflows/package-availability-check.yml b/.github/workflows/package-availability-check.yml
@@ -15,7 +15,7 @@ jobs:
 
     steps:
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies using pip
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -62,13 +62,13 @@ jobs:
           fi
 
       - name: Checkout repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
         with:
           fetch-depth: 0
           token: ${{ secrets.GH_ACCESS_TOKEN }}
 
       - name: Install uv and set Python version
-        uses: astral-sh/setup-uv@v7
+        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8
         with:
           version: "0.11.2"
           python-version: "3.12"
@@ -285,7 +285,7 @@ jobs:
 
       - name: Create GitHub Release
         id: create-release
-        uses: softprops/action-gh-release@v2
+        uses: softprops/action-gh-release@153bb8e04406b158c6c84fc1615b65b24149a1fe # v2
         with:
           tag_name: v${{ steps.new-version.outputs.version }}
           name: v${{ steps.new-version.outputs.version }}
@@ -299,8 +299,10 @@ jobs:
 
       - name: Notify Slack on success
         if: success()
-        uses: slackapi/slack-github-action@v1.26.0
+        uses: slackapi/slack-github-action@af78098f536edbc4de71162a307590698245be95 # v3
         with:
+          webhook: ${{ secrets.SLACK_WEBHOOK_RELEASES }}
+          webhook-type: incoming-webhook
           payload: |
             {
               "text": "✅ Langfuse Python SDK v${{ steps.new-version.outputs.version }} published to PyPI",
@@ -378,14 +380,13 @@ jobs:
                 }
               ]
             }
-        env:
-          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_RELEASES }}
-          SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
 
       - name: Notify Slack on failure
         if: failure()
-        uses: slackapi/slack-github-action@v1.26.0
+        uses: slackapi/slack-github-action@af78098f536edbc4de71162a307590698245be95 # v3
         with:
+          webhook: ${{ secrets.SLACK_WEBHOOK_ENGINEERING }}
+          webhook-type: incoming-webhook
           payload: |
             {
               "text": "❌ Langfuse Python SDK release workflow failed",
@@ -471,6 +472,3 @@ jobs:
                 }
               ]
             }
-        env:
-          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_ENGINEERING }}
-          SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py
@@ -2427,6 +2427,7 @@ def run_experiment(
             - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
             - item_results: List of results for each processed item with outputs and evaluations
             - run_evaluations: List of aggregate evaluation results for the entire run
+            - experiment_id: Stable identifier for the experiment run across all items
             - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
             - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
 
@@ -2577,6 +2578,8 @@ async def _run_experiment_async(
             f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
         )
 
+        shared_fallback_experiment_id = self._create_observation_id()
+
         # Set up concurrency control
         semaphore = asyncio.Semaphore(max_concurrency)
 
@@ -2588,6 +2591,7 @@ async def process_item(item: ExperimentItem) -> ExperimentItemResult:
                     task,
                     evaluators,
                     composite_evaluator,
+                    shared_fallback_experiment_id,
                     name,
                     run_name,
                     description,
@@ -2619,7 +2623,14 @@ async def process_item(item: ExperimentItem) -> ExperimentItemResult:
                 langfuse_logger.error(f"Run evaluator failed: {e}")
 
         # Generate dataset run URL if applicable
-        dataset_run_id = valid_results[0].dataset_run_id if valid_results else None
+        dataset_run_id = next(
+            (
+                result.dataset_run_id
+                for result in valid_results
+                if result.dataset_run_id
+            ),
+            None,
+        )
         dataset_run_url = None
         if dataset_run_id and data:
             try:
@@ -2665,6 +2676,7 @@ async def process_item(item: ExperimentItem) -> ExperimentItemResult:
             description=description,
             item_results=valid_results,
             run_evaluations=run_evaluations,
+            experiment_id=dataset_run_id or shared_fallback_experiment_id,
             dataset_run_id=dataset_run_id,
             dataset_run_url=dataset_run_url,
         )
@@ -2675,6 +2687,7 @@ async def _process_experiment_item(
         task: Callable,
         evaluators: List[Callable],
         composite_evaluator: Optional[CompositeEvaluatorFunction],
+        fallback_experiment_id: str,
         experiment_name: str,
         experiment_run_name: str,
         experiment_description: Optional[str],
@@ -2753,7 +2766,7 @@ async def _process_experiment_item(
                 if isinstance(item_metadata, dict):
                     final_observation_metadata.update(item_metadata)
 
-                experiment_id = dataset_run_id or self._create_observation_id()
+                experiment_id = dataset_run_id or fallback_experiment_id
                 experiment_item_id = (
                     dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
                 )
diff --git a/langfuse/experiment.py b/langfuse/experiment.py
@@ -303,6 +303,9 @@ class ExperimentResult:
             containing the original item, task output, evaluations, and trace information.
         run_evaluations: List of aggregate evaluation results computed across all items,
             such as average scores, statistical summaries, or cross-item analyses.
+        experiment_id: ID of the experiment run propagated across all items. For
+            Langfuse datasets, this matches the dataset run ID. For local experiments,
+            this is a stable SDK-generated identifier for the run.
         dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
         dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
 
@@ -361,6 +364,7 @@ def __init__(
         description: Optional[str],
         item_results: List[ExperimentItemResult],
         run_evaluations: List[Evaluation],
+        experiment_id: str,
         dataset_run_id: Optional[str] = None,
         dataset_run_url: Optional[str] = None,
     ):
@@ -372,6 +376,7 @@ def __init__(
             description: Optional description of the experiment.
             item_results: List of results from processing individual dataset items.
             run_evaluations: List of aggregate evaluation results for the entire run.
+            experiment_id: ID of the experiment run.
             dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
             dataset_run_url: Optional URL to view results in Langfuse UI.
         """
@@ -380,6 +385,7 @@ def __init__(
         self.description = description
         self.item_results = item_results
         self.run_evaluations = run_evaluations
+        self.experiment_id = experiment_id
         self.dataset_run_id = dataset_run_id
         self.dataset_run_url = dataset_run_url
 
diff --git a/langfuse/langchain/CallbackHandler.py b/langfuse/langchain/CallbackHandler.py
@@ -303,6 +303,28 @@ def _parse_langfuse_trace_attributes(
 
         return attributes
 
+    def _get_langchain_observation_metadata(
+        self,
+        *,
+        parent_run_id: Optional[UUID],
+        tags: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        keep_langfuse_trace_attributes: bool = False,
+    ) -> Optional[Dict[str, Any]]:
+        observation_metadata = self.__join_tags_and_metadata(
+            tags=tags,
+            metadata=metadata,
+            keep_langfuse_trace_attributes=keep_langfuse_trace_attributes,
+        )
+
+        if parent_run_id is not None:
+            return observation_metadata
+
+        root_metadata = observation_metadata.copy() if observation_metadata else {}
+        root_metadata["is_langchain_root"] = True
+
+        return root_metadata
+
     def on_chain_start(
         self,
         serialized: Optional[Dict[str, Any]],
@@ -325,7 +347,11 @@ def on_chain_start(
             )
 
             span_name = self.get_langchain_run_name(serialized, **kwargs)
-            span_metadata = self.__join_tags_and_metadata(tags, metadata)
+            span_metadata = self._get_langchain_observation_metadata(
+                parent_run_id=parent_run_id,
+                tags=tags,
+                metadata=metadata,
+            )
             span_level = "DEBUG" if tags and LANGSMITH_TAG_HIDDEN in tags else None
 
             observation_type = self._get_observation_type_from_serialized(
@@ -690,7 +716,11 @@ def on_tool_start(
                 "on_tool_start", run_id, parent_run_id, input_str=input_str
             )
 
-            meta = self.__join_tags_and_metadata(tags, metadata)
+            meta = self._get_langchain_observation_metadata(
+                parent_run_id=parent_run_id,
+                tags=tags,
+                metadata=metadata,
+            )
 
             if not meta:
                 meta = {}
@@ -734,7 +764,11 @@ def on_retriever_start(
                 "on_retriever_start", run_id, parent_run_id, query=query
             )
             span_name = self.get_langchain_run_name(serialized, **kwargs)
-            span_metadata = self.__join_tags_and_metadata(tags, metadata)
+            span_metadata = self._get_langchain_observation_metadata(
+                parent_run_id=parent_run_id,
+                tags=tags,
+                metadata=metadata,
+            )
             span_level = "DEBUG" if tags and LANGSMITH_TAG_HIDDEN in tags else None
 
             observation_type = self._get_observation_type_from_serialized(
@@ -865,9 +899,10 @@ def __on_llm_action(
             content = {
                 "name": self.get_langchain_run_name(serialized, **kwargs),
                 "input": prompts,
-                "metadata": self.__join_tags_and_metadata(
-                    tags,
-                    metadata,
+                "metadata": self._get_langchain_observation_metadata(
+                    parent_run_id=parent_run_id,
+                    tags=tags,
+                    metadata=metadata,
                     # If llm is run isolated and outside chain, keep trace attributes
                     keep_langfuse_trace_attributes=True
                     if parent_run_id is None
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/tests/live_provider/test_langchain.py b/tests/live_provider/test_langchain.py
diff --git a/tests/unit/test_propagate_attributes.py b/tests/unit/test_propagate_attributes.py
diff --git a/uv.lock b/uv.lock