Reorder methods and expand docstrings

ghukill · ghukill · commit bd3b93705ebd · 2025-05-21T09:25:07.000-04:00
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -120,7 +120,7 @@ def dataset_with_runs_location(tmp_path) -> str:
         ]
     )
 
-    # simulate ETL runs for 'alma'
+    # simulate ETL runs for 'dspace'
     run_params.extend(
         [
             (30, "dspace", "2024-12-02", "full", "index", "run-8"),
diff --git a/tests/test_runs.py b/tests/test_runs.py
@@ -25,7 +25,7 @@ def test_timdex_run_manager_parse_single_parquet_file_success(timdex_run_manager
     """Parse run metadata from first parquet file in fixture dataset.  We know the details
     of this ETL run in advance given the deterministic fixture that generated it."""
     parquet_filepath = timdex_run_manager.timdex_dataset.dataset.files[0]
-    run_metadata = timdex_run_manager.parse_run_metadata_from_parquet_file(
+    run_metadata = timdex_run_manager._parse_run_metadata_from_parquet_file(
         parquet_filepath
     )
     assert run_metadata["source"] == "alma"
@@ -37,7 +37,7 @@ def test_timdex_run_manager_parse_single_parquet_file_success(timdex_run_manager
 
 
 def test_timdex_run_manager_parse_multiple_parquet_files(timdex_run_manager):
-    parquet_metadata_df = timdex_run_manager.get_parquet_files_run_metadata()
+    parquet_metadata_df = timdex_run_manager._get_parquet_files_run_metadata()
 
     # assert 16 rows for this per-file dataframe, despite only 14 distinct ETL "runs"
     assert len(parquet_metadata_df) == 16
@@ -75,7 +75,7 @@ def test_timdex_run_manager_caches_runs_dataframe(timdex_run_manager):
     assert timdex_run_manager._runs_metadata_cache is not None
 
     with patch.object(
-        timdex_run_manager, "get_parquet_files_run_metadata"
+        timdex_run_manager, "_get_parquet_files_run_metadata"
     ) as mocked_intermediate_method:
         mocked_intermediate_method.side_effect = Exception(
             "I am not reached, cache is used."
diff --git a/timdex_dataset_api/run.py b/timdex_dataset_api/run.py
@@ -27,70 +27,17 @@ def __init__(self, timdex_dataset: "TIMDEXDataset"):
     def clear_cache(self) -> None:
         self._runs_metadata_cache = None
 
-    def parse_run_metadata_from_parquet_file(self, parquet_filepath: str) -> dict:
-        """Parse source, run_date, run_type, and run_id from a single Parquet file.
-
-        Args:
-            parquet_filepath: Path to the parquet file
-        """
-        parquet_file = pq.ParquetFile(
-            parquet_filepath,
-            filesystem=self.timdex_dataset.filesystem,  # type: ignore[union-attr]
-        )
-        file_meta = parquet_file.metadata.to_dict()
-        num_rows = file_meta["num_rows"]
-        columns_meta = file_meta["row_groups"][0]["columns"]  # type: ignore[typeddict-item]
-        source = columns_meta[3]["statistics"]["max"]
-        run_date = columns_meta[4]["statistics"]["max"]
-        run_type = columns_meta[5]["statistics"]["max"]
-        run_id = columns_meta[7]["statistics"]["max"]
-
-        return {
-            "source": source,
-            "run_date": run_date,
-            "run_type": run_type,
-            "run_id": run_id,
-            "num_rows": num_rows,
-            "filename": parquet_filepath,
-        }
-
-    def get_parquet_files_run_metadata(self, max_workers: int = 250) -> pd.DataFrame:
-        """Retrieve run metadata from parquet file(s) in dataset.
-
-        A single ETL run may still be spread across multiple Parquet files making this
-        data ungrouped by run.
-
-        Args:
-            max_workers: Maximum number of parallel workers for processing
-                - a high number is generally safe given the lightweight nature of the
-                thread's work, just reading a few parquet file header bytes
-        """
-        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-            futures = []
-            for parquet_filepath in self.timdex_dataset.dataset.files:  # type: ignore[attr-defined]
-                future = executor.submit(
-                    self.parse_run_metadata_from_parquet_file,
-                    parquet_filepath,
-                )
-                futures.append(future)
-
-            done, not_done = concurrent.futures.wait(
-                futures, return_when=concurrent.futures.ALL_COMPLETED
-            )
-
-            results = []
-            for future in done:
-                try:
-                    if result := future.result():
-                        results.append(result)
-                except Exception:
-                    logger.exception("Error reading run metadata from parquet file.")
-
-        return pd.DataFrame(results) if results else pd.DataFrame()
-
     def get_runs_metadata(self, *, refresh: bool = False) -> pd.DataFrame:
         """Get metadata for all runs in dataset, grouped by run_id.
 
+        The dataframe returned includes the following columns:
+            - source
+            - run_date
+            - run_type
+            - run_id
+            - num_rows: total number of records for that run_id
+            - parquet_files: list of parquet file(s) that are associated with that run
+
         Args:
             refresh: If True, force refresh of cached metadata
         """
@@ -99,7 +46,7 @@ def get_runs_metadata(self, *, refresh: bool = False) -> pd.DataFrame:
         if self._runs_metadata_cache is not None and not refresh:
             return self._runs_metadata_cache
 
-        ungrouped_runs_df = self.get_parquet_files_run_metadata()
+        ungrouped_runs_df = self._get_parquet_files_run_metadata()
         if ungrouped_runs_df.empty:
             return ungrouped_runs_df
 
@@ -167,3 +114,73 @@ def get_current_source_parquet_files(self, source: str) -> list[str]:
         ordered_parquet_files.extend(last_full_run.parquet_files)
 
         return ordered_parquet_files
+
+    def _get_parquet_files_run_metadata(self, max_workers: int = 250) -> pd.DataFrame:
+        """Retrieve run metadata from parquet file(s) in dataset.
+
+        A single ETL run may still be spread across multiple Parquet files making this
+        data ungrouped by run.
+
+        Args:
+            max_workers: Maximum number of parallel workers for processing
+                - a high number is generally safe given the lightweight nature of the
+                thread's work, just reading a few parquet file header bytes
+        """
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for parquet_filepath in self.timdex_dataset.dataset.files:  # type: ignore[attr-defined]
+                future = executor.submit(
+                    self._parse_run_metadata_from_parquet_file,
+                    parquet_filepath,
+                )
+                futures.append(future)
+
+            done, not_done = concurrent.futures.wait(
+                futures, return_when=concurrent.futures.ALL_COMPLETED
+            )
+
+            results = []
+            for future in done:
+                try:
+                    if result := future.result():
+                        results.append(result)
+                except Exception:
+                    logger.exception("Error reading run metadata from parquet file.")
+
+        return pd.DataFrame(results) if results else pd.DataFrame()
+
+    def _parse_run_metadata_from_parquet_file(self, parquet_filepath: str) -> dict:
+        """Parse source, run_date, run_type, and run_id from a single Parquet file.
+
+        The TIMDEX parquet dataset has a characteristic that we can use for extracting
+        run information from a single row in a parquet file: all rows in the parquet file
+        share the column values source, run_date, run_type, and run_id.
+
+        Taking this a step further, we can extract these values without even touching a
+        single proper row from the parquet file, but from reading the parquet file
+        column statistics.  In this way, we can extract run information from a parquet
+        file by only reading the lightweight parquet file metadata.
+
+        Args:
+            parquet_filepath: Path to the parquet file
+        """
+        parquet_file = pq.ParquetFile(
+            parquet_filepath,
+            filesystem=self.timdex_dataset.filesystem,  # type: ignore[union-attr]
+        )
+        file_meta = parquet_file.metadata.to_dict()
+        num_rows = file_meta["num_rows"]
+        columns_meta = file_meta["row_groups"][0]["columns"]  # type: ignore[typeddict-item]
+        source = columns_meta[3]["statistics"]["max"]
+        run_date = columns_meta[4]["statistics"]["max"]
+        run_type = columns_meta[5]["statistics"]["max"]
+        run_id = columns_meta[7]["statistics"]["max"]
+
+        return {
+            "source": source,
+            "run_date": run_date,
+            "run_type": run_type,
+            "run_id": run_id,
+            "num_rows": num_rows,
+            "filename": parquet_filepath,
+        }

Original file line number	Diff line number	Diff line change
`@@ -120,7 +120,7 @@ def dataset_with_runs_location(tmp_path) -> str:`
`120`	`120`	`]`
`121`	`121`	`)`
`122`	`122`
`123`		`- # simulate ETL runs for 'alma'`
	`123`	`+ # simulate ETL runs for 'dspace'`
`124`	`124`	`run_params.extend(`
`125`	`125`	`[`
`126`	`126`	`(30, "dspace", "2024-12-02", "full", "index", "run-8"),`