Skip to content

Commit bcaa7b3

Browse files
committed
Simplify TIMDEXRunManager interface
Why these changes are being introduced: It was a bit confusing, and required unneeded logic branching, if one should use .get_current_parquet_files() or .get_current_source_parquet_files(). How this addresses that need: * .get_current_parquet_files() becomes the public interface to use, now with an optional 'source' keyword argument Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-494
1 parent 7e0e795 commit bcaa7b3

3 files changed

Lines changed: 23 additions & 19 deletions

File tree

tests/test_runs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def test_timdex_run_manager_get_all_current_run_parquet_files_success(
7272
def test_timdex_run_manager_get_source_current_run_parquet_files_success(
7373
timdex_run_manager,
7474
):
75-
ordered_parquet_files = timdex_run_manager.get_current_source_parquet_files("alma")
75+
ordered_parquet_files = timdex_run_manager._get_current_source_parquet_files("alma")
7676

7777
# assert 6 parquet files, despite being 8 total for 'alma' source
7878
# this represents the last full run and all daily since

timdex_dataset_api/dataset.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -167,12 +167,12 @@ def load(
167167
if current_records:
168168
timdex_run_manager = TIMDEXRunManager(timdex_dataset=self)
169169

170-
# if filters.source is set, further limit to only this source
171-
source = filters.get("source")
172-
if source:
173-
self.paths = timdex_run_manager.get_current_source_parquet_files(source)
174-
else:
175-
self.paths = timdex_run_manager.get_current_parquet_files()
170+
# update paths, limiting by source if set
171+
self.paths = timdex_run_manager.get_current_parquet_files(
172+
source=filters.get("source")
173+
)
174+
175+
# reload pyarrow dataset
176176
self._load_pyarrow_dataset()
177177

178178
# filter dataset

timdex_dataset_api/run.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,22 @@ def get_runs_metadata(self, *, refresh: bool = False) -> pd.DataFrame:
8585
)
8686
return grouped_runs_df
8787

88-
def get_current_source_parquet_files(self, source: str) -> list[str]:
88+
def get_current_parquet_files(self, source: str | None = None) -> list[str]:
89+
"""Get reverse chronological list of parquet files associated with current runs.
90+
91+
Args:
92+
source: if provided, limits parquet files to only that source
93+
"""
94+
runs_df = self.get_runs_metadata() # run metadata is cached for future calls
95+
sources = [source] if source else list(runs_df.source.unique())
96+
97+
source_parquet_files = []
98+
for _source in sources:
99+
source_parquet_files.extend(self._get_current_source_parquet_files(_source))
100+
101+
return source_parquet_files
102+
103+
def _get_current_source_parquet_files(self, source: str) -> list[str]:
89104
"""Get reverse chronological list of current parquet files for a source.
90105
91106
Args:
@@ -115,17 +130,6 @@ def get_current_source_parquet_files(self, source: str) -> list[str]:
115130

116131
return ordered_parquet_files
117132

118-
def get_current_parquet_files(self) -> list[str]:
119-
"""Get reverse chronological list of current parquet files for ALL sources."""
120-
runs_df = self.get_runs_metadata() # run metadata is cached for future calls
121-
sources = list(runs_df.source.unique())
122-
123-
source_parquet_files = []
124-
for source in sources:
125-
source_parquet_files.extend(self.get_current_source_parquet_files(source))
126-
127-
return source_parquet_files
128-
129133
def _get_parquet_files_run_metadata(self, max_workers: int = 250) -> pd.DataFrame:
130134
"""Retrieve run metadata from parquet file(s) in dataset.
131135

0 commit comments

Comments
 (0)