
Commit f584a1d

TIMDEXRunManager for producing ETL run metadata
Why these changes are being introduced:

One of the challenges the architecture of the TIMDEX parquet dataset presents is getting quick and easy metadata about ETL "runs" in the dataset. The year/month/day partitioning structure is very efficient for accessing a run if you know the date, where only a few parquet files are scanned, but it is not geared towards quickly isolating the runs (parquet files) associated with a given source. Having metadata about runs provides a map for efficiently accessing meaningful subsets of data. One example would be fully refreshing a source in OpenSearch: to do so, you'd want to access all runs for a given source since, and including, the last run_type=full run, as those runs represent the current state of the source in TIMDEX. Unfortunately, this is not terribly efficient to perform naively with pyarrow or DuckDB, where potentially thousands of parquet files are touched. Similar to how Apache Iceberg (a parquet dataset architecture) works, we need some metadata about each "run" in the dataset that correlates to parquet file(s).

How this addresses that need:

A new class, TIMDEXRunManager, provides this functionality. It produces a pandas dataframe of metadata about all runs in the dataset, including the explicit parquet filepath(s) each run is associated with, in a highly efficient and parallelized way. This is achieved by:

1. Getting a list of all parquet files from the dataset.
2. Reading only each file's metadata footer, whose column statistics carry metadata about the run that produced the file.
3. Aggregating the results and grouping by "run_id".

The result is a dataframe that provides a precise map of run metadata to parquet files in the dataset. With those parquet files identified, this unblocks further functionality for this library, like "replaying" the runs for a given source in chronological order to refresh it in OpenSearch.

Side effects of this change:

* None. No changes are made to pre-existing functionality, just the addition of this new information-gathering class.

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/TIMX-493
* https://mitlibraries.atlassian.net/browse/TIMX-494
1 parent 3f97353 commit f584a1d
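
For orientation, a minimal usage sketch of the class this commit adds (class and method names are from the diff below; the dataset location is a placeholder):

from timdex_dataset_api import TIMDEXDataset
from timdex_dataset_api.run import TIMDEXRunManager

# placeholder location; any local path or S3 URI the library supports
timdex_dataset = TIMDEXDataset("path/to/dataset")
run_manager = TIMDEXRunManager(timdex_dataset=timdex_dataset)

# one row per ETL run, including the parquet files that run produced
runs_df = run_manager.get_runs_metadata()

# parquet files representing the current state of a source:
# the last 'full' run plus all 'daily' runs since, newest first
current_files = run_manager.get_current_source_parquet_files("alma")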

3 files changed

Lines changed: 311 additions & 0 deletions


tests/conftest.py

Lines changed: 57 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 # ruff: noqa: D205, D209
 
+import os
 
 import pytest
 
@@ -10,6 +11,7 @@
     generate_sample_records_with_simulated_partitions,
 )
 from timdex_dataset_api import TIMDEXDataset
+from timdex_dataset_api.dataset import TIMDEXDatasetConfig
 
 
 @pytest.fixture(autouse=True)
@@ -90,3 +92,58 @@ def _records_iter(num_records):
     )
 
     return _records_iter
+
+
+@pytest.fixture
+def dataset_with_runs_location(tmp_path) -> str:
+    """Fixture to simulate a dataset with multiple full and daily ETL runs."""
+    location = str(tmp_path / "dataset_with_runs")
+    os.mkdir(location)
+
+    # max_rows_per_file=75 forces runs larger than 75 records to span multiple files
+    timdex_dataset = TIMDEXDataset(
+        location, config=TIMDEXDatasetConfig(max_rows_per_group=75, max_rows_per_file=75)
+    )
+    timdex_dataset.load()
+
+    run_params = []
+
+    # simulate ETL runs for 'alma'
+    run_params.extend(
+        [
+            (40, "alma", "2024-12-01", "full", "index", "run-1"),
+            (20, "alma", "2024-12-15", "daily", "index", "run-2"),
+            (100, "alma", "2025-01-01", "full", "index", "run-3"),
+            (50, "alma", "2025-01-02", "daily", "index", "run-4"),
+            (25, "alma", "2025-01-03", "daily", "index", "run-5"),
+            (10, "alma", "2025-01-04", "daily", "delete", "run-6"),
+            (9, "alma", "2025-01-05", "daily", "index", "run-7"),
+        ]
+    )
+
+    # simulate ETL runs for 'dspace'
+    run_params.extend(
+        [
+            (30, "dspace", "2024-12-02", "full", "index", "run-8"),
+            (10, "dspace", "2024-12-16", "daily", "index", "run-9"),
+            (90, "dspace", "2025-02-01", "full", "index", "run-10"),
+            (40, "dspace", "2025-02-02", "daily", "index", "run-11"),
+            (15, "dspace", "2025-02-03", "daily", "index", "run-12"),
+            (5, "dspace", "2025-02-04", "daily", "delete", "run-13"),
+            (4, "dspace", "2025-02-05", "daily", "index", "run-14"),
+        ]
+    )
+
+    # write each simulated run to the dataset
+    for params in run_params:
+        num_records, source, run_date, run_type, action, run_id = params
+        records = generate_sample_records(
+            num_records,
+            source=source,
+            run_date=run_date,
+            run_type=run_type,
+            action=action,
+            run_id=run_id,
+        )
+        timdex_dataset.write(records)
+
+    return location
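
Derived from the run parameters above: run-3 (100 records) and run-10 (90 records) each exceed max_rows_per_file=75, so the fixture yields 16 parquet files for 14 runs. A quick way to confirm (dataset_with_runs_location is the path this fixture returns, supplied by pytest):

from timdex_dataset_api import TIMDEXDataset

timdex_dataset = TIMDEXDataset(dataset_with_runs_location)  # value from the fixture
timdex_dataset.load()
assert len(timdex_dataset.dataset.files) == 16  # 16 files for 14 runs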

tests/test_runs.py

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
# ruff: noqa: SLF001, D205, D209, PLR2004

import datetime
from unittest.mock import patch

import pytest

from timdex_dataset_api import TIMDEXDataset
from timdex_dataset_api.run import TIMDEXRunManager


@pytest.fixture
def timdex_run_manager(dataset_with_runs_location):
    timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
    return TIMDEXRunManager(timdex_dataset=timdex_dataset)


def test_timdex_run_manager_init(dataset_with_runs_location):
    timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
    timdex_run_manager = TIMDEXRunManager(timdex_dataset=timdex_dataset)
    assert timdex_run_manager._runs_metadata_cache is None


def test_timdex_run_manager_parse_single_parquet_file_success(timdex_run_manager):
    """Parse run metadata from first parquet file in fixture dataset. We know the details
    of this ETL run in advance given the deterministic fixture that generated it."""
    parquet_filepath = timdex_run_manager.timdex_dataset.dataset.files[0]
    run_metadata = timdex_run_manager.parse_run_metadata_from_parquet_file(
        parquet_filepath
    )
    assert run_metadata["source"] == "alma"
    assert run_metadata["run_date"] == datetime.date(2024, 12, 1)
    assert run_metadata["run_type"] == "full"
    assert run_metadata["run_id"] == "run-1"
    assert run_metadata["num_rows"] == 40
    assert run_metadata["filename"] == parquet_filepath


def test_timdex_run_manager_parse_multiple_parquet_files(timdex_run_manager):
    parquet_metadata_df = timdex_run_manager.get_parquet_files_run_metadata()

    # assert 16 rows for this per-file dataframe, despite only 14 distinct ETL "runs"
    assert len(parquet_metadata_df) == 16

    # assert each source has metadata for 8 parquet files
    assert parquet_metadata_df.source.value_counts().to_dict() == {"alma": 8, "dspace": 8}


def test_timdex_run_manager_get_runs_df(timdex_run_manager):
    runs_df = timdex_run_manager.get_runs_metadata()

    # assert two "large" runs have multiple parquet files
    assert len(runs_df[runs_df.parquet_files_count > 1]) == 2

    # assert 7 distinct runs per source, despite more parquet files
    assert runs_df.source.value_counts().to_dict() == {"alma": 7, "dspace": 7}


def test_timdex_run_manager_get_source_current_run_parquet_files_success(
    timdex_run_manager,
):
    ordered_parquet_files = timdex_run_manager.get_current_source_parquet_files("alma")

    # assert 6 parquet files, despite 8 total for alma:
    # this represents the last full run and all daily runs since
    assert len(ordered_parquet_files) == 6

    # assert sorted reverse chronologically
    assert "year=2025/month=01/day=05" in ordered_parquet_files[0]
    assert "year=2025/month=01/day=01" in ordered_parquet_files[-1]


def test_timdex_run_manager_caches_runs_dataframe(timdex_run_manager):
    runs_df = timdex_run_manager.get_runs_metadata()
    assert timdex_run_manager._runs_metadata_cache is not None

    with patch.object(
        timdex_run_manager, "get_parquet_files_run_metadata"
    ) as mocked_intermediate_method:
        mocked_intermediate_method.side_effect = Exception(
            "I am not reached, cache is used."
        )
        runs_df_2 = timdex_run_manager.get_runs_metadata()

    assert runs_df.equals(runs_df_2)
timdex_dataset_api/run.py

Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
"""timdex_dataset_api/run.py"""

import concurrent.futures
import logging
import time
from typing import TYPE_CHECKING

import pandas as pd
import pyarrow.parquet as pq

if TYPE_CHECKING:
    from timdex_dataset_api.dataset import TIMDEXDataset

logger = logging.getLogger(__name__)


class TIMDEXRunManager:
    """Manages and provides access to ETL run metadata from the TIMDEX parquet dataset."""

    def __init__(self, timdex_dataset: "TIMDEXDataset"):
        self.timdex_dataset: TIMDEXDataset = timdex_dataset
        if self.timdex_dataset.dataset is None:
            self.timdex_dataset.load()

        self._runs_metadata_cache: pd.DataFrame | None = None

    def clear_cache(self) -> None:
        self._runs_metadata_cache = None

    def parse_run_metadata_from_parquet_file(self, parquet_filepath: str) -> dict:
        """Parse source, run_date, run_type, and run_id from a single parquet file.

        Args:
            parquet_filepath: path to the parquet file
        """
        # only the file's metadata footer is read here, never the row data itself
        parquet_file = pq.ParquetFile(
            parquet_filepath,
            filesystem=self.timdex_dataset.filesystem,  # type: ignore[union-attr]
        )
        file_meta = parquet_file.metadata.to_dict()
        num_rows = file_meta["num_rows"]

        # run columns are constant within a file, so the first row group's column
        # statistics (accessed positionally, per the dataset's fixed column order)
        # carry the run metadata
        columns_meta = file_meta["row_groups"][0]["columns"]  # type: ignore[typeddict-item]
        source = columns_meta[3]["statistics"]["max"]
        run_date = columns_meta[4]["statistics"]["max"]
        run_type = columns_meta[5]["statistics"]["max"]
        run_id = columns_meta[7]["statistics"]["max"]

        return {
            "source": source,
            "run_date": run_date,
            "run_type": run_type,
            "run_id": run_id,
            "num_rows": num_rows,
            "filename": parquet_filepath,
        }
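
To make the positional statistics lookup above concrete, this is roughly the dictionary shape pyarrow returns from a footer-only read (illustrative values; the exact column order is defined by the dataset schema and is assumed here):

import pyarrow.parquet as pq

# illustrative: inspect one file's footer without reading any row data
file_meta = pq.ParquetFile("records.parquet").metadata.to_dict()
print(file_meta["num_rows"])  # e.g. 40
# file_meta["row_groups"][0]["columns"][3] looks roughly like:
#   {"path_in_schema": "source",
#    "statistics": {"min": "alma", "max": "alma", ...},
#    ...}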
    def get_parquet_files_run_metadata(self, max_workers: int = 250) -> pd.DataFrame:
        """Retrieve run metadata from parquet file(s) in dataset.

        A single ETL run may be spread across multiple parquet files, so this
        data remains ungrouped by run.

        Args:
            max_workers: maximum number of parallel workers for processing
                - a high number is generally safe given the lightweight nature of
                each thread's work: just reading a few parquet footer bytes
        """
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for parquet_filepath in self.timdex_dataset.dataset.files:  # type: ignore[attr-defined]
                future = executor.submit(
                    self.parse_run_metadata_from_parquet_file,
                    parquet_filepath,
                )
                futures.append(future)

            done, not_done = concurrent.futures.wait(
                futures, return_when=concurrent.futures.ALL_COMPLETED
            )

        results = []
        for future in done:
            try:
                if result := future.result():
                    results.append(result)
            except Exception:
                logger.exception("Error reading run metadata from parquet file.")

        return pd.DataFrame(results) if results else pd.DataFrame()
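
A direct call returns one row per parquet file, not per run; a sketch against the fixture dataset (the location is a placeholder, and the smaller worker pool is arbitrary):

from timdex_dataset_api import TIMDEXDataset
from timdex_dataset_api.run import TIMDEXRunManager

run_manager = TIMDEXRunManager(timdex_dataset=TIMDEXDataset("path/to/dataset_with_runs"))

# ungrouped, per-file metadata; a run that spans multiple parquet files
# appears here once per file
per_file_df = run_manager.get_parquet_files_run_metadata(max_workers=50)
print(per_file_df.columns.tolist())
# -> ['source', 'run_date', 'run_type', 'run_id', 'num_rows', 'filename']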
    def get_runs_metadata(self, *, refresh: bool = False) -> pd.DataFrame:
        """Get metadata for all runs in dataset, grouped by run_id.

        Args:
            refresh: if True, force refresh of cached metadata
        """
        start_time = time.perf_counter()

        if self._runs_metadata_cache is not None and not refresh:
            return self._runs_metadata_cache

        ungrouped_runs_df = self.get_parquet_files_run_metadata()
        if ungrouped_runs_df.empty:
            return ungrouped_runs_df

        # group by run_id
        grouped_runs_df = (
            ungrouped_runs_df.groupby("run_id")
            .agg(
                {
                    "source": "first",
                    "run_date": "first",
                    "run_type": "first",
                    "num_rows": "sum",
                    "filename": list,
                }
            )
            .reset_index()
        )

        # add additional metadata
        grouped_runs_df = grouped_runs_df.rename(columns={"filename": "parquet_files"})
        grouped_runs_df["parquet_files_count"] = grouped_runs_df["parquet_files"].apply(
            len
        )

        # sort by run date and source
        grouped_runs_df = grouped_runs_df.sort_values(
            ["run_date", "source"], ascending=False
        )

        # cache the result
        self._runs_metadata_cache = grouped_runs_df

        logger.info(
            f"Dataset runs metadata retrieved, elapsed: "
            f"{round(time.perf_counter() - start_time, 2)}s, runs: {len(grouped_runs_df)}"
        )
        return grouped_runs_df
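
Against the fixture dataset, the grouped dataframe has 14 rows, one per run, sorted reverse chronologically; the two runs larger than max_rows_per_file=75 aggregate multiple files. Illustrative first and last rows, derived from the fixture parameters (reusing run_manager from the sketch above; file paths abbreviated):

runs_df = run_manager.get_runs_metadata()
# run_id  source  run_date    run_type  num_rows  parquet_files  parquet_files_count
# run-14  dspace  2025-02-05  daily     4         [...]          1
# run-13  dspace  2025-02-04  daily     5         [...]          1
# ...
# run-3   alma    2025-01-01  full      100       [..., ...]     2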
    def get_current_source_parquet_files(self, source: str) -> list[str]:
        """Get reverse chronological list of current parquet files for a source.

        Args:
            source: the source identifier to filter runs
        """
        runs_df = self.get_runs_metadata()
        source_runs_df = runs_df[runs_df.source == source].copy()

        # get last "full" run (runs_df is sorted reverse chronologically)
        full_runs_df = source_runs_df[source_runs_df.run_type == "full"]
        if len(full_runs_df) == 0:
            raise RuntimeError(
                f"Could not find the most recent 'full' run for source: '{source}'"
            )
        last_full_run = full_runs_df.iloc[0]

        # get all "daily" runs since
        daily_runs_df = source_runs_df[
            (source_runs_df.run_type == "daily")
            & (source_runs_df.run_date >= last_full_run.run_date)
        ]

        ordered_parquet_files = []
        for _, daily_run in daily_runs_df.iterrows():
            ordered_parquet_files.extend(daily_run.parquet_files)
        ordered_parquet_files.extend(last_full_run.parquet_files)

        return ordered_parquet_files
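
Putting it together, a sketch of the "replay" use case from the commit message (the location is a placeholder; expected counts follow from the fixture dataset):

from timdex_dataset_api import TIMDEXDataset
from timdex_dataset_api.run import TIMDEXRunManager

run_manager = TIMDEXRunManager(timdex_dataset=TIMDEXDataset("path/to/dataset_with_runs"))

# for 'alma': the last full run (run-3, 2025-01-01, two parquet files) plus all
# daily runs since (run-4 through run-7), newest first -> 6 files total
current_files = run_manager.get_current_source_parquet_files("alma")

# to replay the source oldest-to-newest, e.g. when refreshing OpenSearch,
# iterate the list in reverse
for parquet_file in reversed(current_files):
    ...  # read records from parquet_file and apply index/delete actions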
