Skip to content

Commit 9f0d74b

Browse files
committed
Reduce TIMDEXRunManager and TIMDEXDataset coupling
Why these changes are being introduced: Formerly, an instance of TIMDEXRunManager expected a TIMDEXDataset on init, where it would utilize the pyarrow TIMDEXDataset.dataset. This resulted in an unneeded tight coupling between these classes. How this addresses that need: * TIMDEXRunManager updated to only expect a pyarrow Dataset Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-496
1 parent 00b8d2a commit 9f0d74b

3 files changed

Lines changed: 11 additions & 15 deletions

File tree

tests/test_runs.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,21 @@
1212
@pytest.fixture
1313
def timdex_run_manager(dataset_with_runs_location):
1414
timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
15-
return TIMDEXRunManager(timdex_dataset=timdex_dataset)
15+
timdex_dataset.load()
16+
return TIMDEXRunManager(dataset=timdex_dataset.dataset)
1617

1718

1819
def test_timdex_run_manager_init(dataset_with_runs_location):
1920
timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
20-
timdex_run_manager = TIMDEXRunManager(timdex_dataset=timdex_dataset)
21+
timdex_dataset.load()
22+
timdex_run_manager = TIMDEXRunManager(dataset=timdex_dataset.dataset)
2123
assert timdex_run_manager._runs_metadata_cache is None
2224

2325

2426
def test_timdex_run_manager_parse_single_parquet_file_success(timdex_run_manager):
2527
"""Parse run metadata from first parquet file in fixture dataset. We know the details
2628
of this ETL run in advance given the deterministic fixture that generated it."""
27-
parquet_filepath = timdex_run_manager.timdex_dataset.dataset.files[0]
29+
parquet_filepath = timdex_run_manager.dataset.files[0]
2830
run_metadata = timdex_run_manager._parse_run_metadata_from_parquet_file(
2931
parquet_filepath
3032
)

timdex_dataset_api/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def load(
167167
if current_records:
168168
timdex_run_manager = TIMDEXRunManager(timdex_dataset=self)
169169

170-
# update paths, limiting by source if set
170+
timdex_run_manager = TIMDEXRunManager(dataset=self.dataset)
171171
self.paths = timdex_run_manager.get_current_parquet_files(
172172
source=filters.get("source")
173173
)

timdex_dataset_api/run.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,19 @@
33
import concurrent.futures
44
import logging
55
import time
6-
from typing import TYPE_CHECKING
76

87
import pandas as pd
8+
import pyarrow.dataset as ds
99
import pyarrow.parquet as pq
1010

11-
if TYPE_CHECKING:
12-
from timdex_dataset_api.dataset import TIMDEXDataset
13-
1411
logger = logging.getLogger(__name__)
1512

1613

1714
class TIMDEXRunManager:
1815
"""Manages and provides access to ETL run metadata from the TIMDEX parquet dataset."""
1916

20-
def __init__(self, timdex_dataset: "TIMDEXDataset"):
21-
self.timdex_dataset: TIMDEXDataset = timdex_dataset
22-
if self.timdex_dataset.dataset is None:
23-
self.timdex_dataset.load()
24-
17+
def __init__(self, dataset: ds.Dataset):
18+
self.dataset = dataset
2519
self._runs_metadata_cache: pd.DataFrame | None = None
2620

2721
def clear_cache(self) -> None:
@@ -143,7 +137,7 @@ def _get_parquet_files_run_metadata(self, max_workers: int = 250) -> pd.DataFram
143137
"""
144138
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
145139
futures = []
146-
for parquet_filepath in self.timdex_dataset.dataset.files: # type: ignore[attr-defined]
140+
for parquet_filepath in self.dataset.files: # type: ignore[attr-defined]
147141
future = executor.submit(
148142
self._parse_run_metadata_from_parquet_file,
149143
parquet_filepath,
@@ -181,7 +175,7 @@ def _parse_run_metadata_from_parquet_file(self, parquet_filepath: str) -> dict:
181175
"""
182176
parquet_file = pq.ParquetFile(
183177
parquet_filepath,
184-
filesystem=self.timdex_dataset.filesystem,
178+
filesystem=self.dataset.filesystem, # type: ignore[attr-defined]
185179
)
186180

187181
file_meta = parquet_file.metadata.to_dict()

0 commit comments

Comments
 (0)