Simplify TIMDEXRunManager interface

ghukill · ghukill · commit bcaa7b3d9761 · 2025-05-21T14:40:20.000-04:00
Why these changes are being introduced:
* It was confusing, and required unneeded logic branching, to decide whether to call .get_current_parquet_files() or .get_current_source_parquet_files().

How this addresses that need:
* .get_current_parquet_files() becomes the single public interface, now with an optional 'source' keyword argument.

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-494
diff --git a/tests/test_runs.py b/tests/test_runs.py
@@ -72,7 +72,7 @@ def test_timdex_run_manager_get_all_current_run_parquet_files_success(
 def test_timdex_run_manager_get_source_current_run_parquet_files_success(
     timdex_run_manager,
 ):
-    ordered_parquet_files = timdex_run_manager.get_current_source_parquet_files("alma")
+    ordered_parquet_files = timdex_run_manager._get_current_source_parquet_files("alma")
 
     # assert 6 parquet files, despite being 8 total for 'alma' source
     # this represents the last full run and all daily since
diff --git a/timdex_dataset_api/dataset.py b/timdex_dataset_api/dataset.py
@@ -167,12 +167,12 @@ def load(
         if current_records:
             timdex_run_manager = TIMDEXRunManager(timdex_dataset=self)
 
-            # if filters.source is set, further limit to only this source
-            source = filters.get("source")
-            if source:
-                self.paths = timdex_run_manager.get_current_source_parquet_files(source)
-            else:
-                self.paths = timdex_run_manager.get_current_parquet_files()
+            # update paths, limiting by source if set
+            self.paths = timdex_run_manager.get_current_parquet_files(
+                source=filters.get("source")
+            )
+
+            # reload pyarrow dataset
             self._load_pyarrow_dataset()
 
         # filter dataset
diff --git a/timdex_dataset_api/run.py b/timdex_dataset_api/run.py
@@ -85,7 +85,22 @@ def get_runs_metadata(self, *, refresh: bool = False) -> pd.DataFrame:
         )
         return grouped_runs_df
 
-    def get_current_source_parquet_files(self, source: str) -> list[str]:
+    def get_current_parquet_files(self, source: str | None = None) -> list[str]:
+        """Get reverse chronological list of parquet files associated with current runs.
+
+        Args:
+            source: if provided, limits parquet files to only that source
+        """
+        runs_df = self.get_runs_metadata()  # run metadata is cached for future calls
+        sources = [source] if source else list(runs_df.source.unique())
+
+        source_parquet_files = []
+        for _source in sources:
+            source_parquet_files.extend(self._get_current_source_parquet_files(_source))
+
+        return source_parquet_files
+
+    def _get_current_source_parquet_files(self, source: str) -> list[str]:
         """Get reverse chronological list of current parquet files for a source.
 
         Args:
@@ -115,17 +130,6 @@ def get_current_source_parquet_files(self, source: str) -> list[str]:
 
         return ordered_parquet_files
 
-    def get_current_parquet_files(self) -> list[str]:
-        """Get reverse chronological list of current parquet files for ALL sources."""
-        runs_df = self.get_runs_metadata()  # run metadata is cached for future calls
-        sources = list(runs_df.source.unique())
-
-        source_parquet_files = []
-        for source in sources:
-            source_parquet_files.extend(self.get_current_source_parquet_files(source))
-
-        return source_parquet_files
-
     def _get_parquet_files_run_metadata(self, max_workers: int = 250) -> pd.DataFrame:
         """Retrieve run metadata from parquet file(s) in dataset.