Skip to content

Commit 4b2f2d2

Browse files
committed
Rename metadata rebuild method and improve refresh methods
Why these changes are being introduced:

The former TIMDEXDatasetMetadata method name recreate_static_database_file() was too narrowly focused. This method is responsible for rebuilding the entire dataset metadata structure.

How this addresses that need:
* Renames the method to rebuild_dataset_metadata()
* Updates the refresh() methods on both TIMDEXDataset and TIMDEXDatasetMetadata to be more fully inclusive

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-543
1 parent 25d4430 commit 4b2f2d2

5 files changed

Lines changed: 24 additions & 17 deletions

File tree

tests/conftest.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def timdex_dataset_multi_source(tmp_path_factory) -> TIMDEXDataset:
113113
)
114114

115115
# ensure static metadata database exists for read methods
116-
dataset.metadata.recreate_static_database_file()
116+
dataset.metadata.rebuild_dataset_metadata()
117117
dataset.metadata.refresh()
118118

119119
return dataset
@@ -223,7 +223,7 @@ def timdex_dataset_same_day_runs(tmp_path) -> TIMDEXDataset:
223223
def timdex_metadata(timdex_dataset_with_runs) -> TIMDEXDatasetMetadata:
224224
"""TIMDEXDatasetMetadata with static database file created."""
225225
metadata = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location)
226-
metadata.recreate_static_database_file()
226+
metadata.rebuild_dataset_metadata()
227227
metadata.refresh()
228228
return metadata
229229

@@ -233,7 +233,7 @@ def timdex_dataset_with_runs_with_metadata(
233233
timdex_dataset_with_runs,
234234
) -> TIMDEXDataset:
235235
"""TIMDEXDataset with runs and static metadata created for read tests."""
236-
timdex_dataset_with_runs.metadata.recreate_static_database_file()
236+
timdex_dataset_with_runs.metadata.rebuild_dataset_metadata()
237237
timdex_dataset_with_runs.metadata.refresh()
238238
return timdex_dataset_with_runs
239239

tests/test_metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def test_tdm_s3_dataset_structure_properties(s3_bucket_mocked):
4343

4444
def test_tdm_create_metadata_database_file_success(caplog, timdex_metadata_empty):
4545
caplog.set_level("DEBUG")
46-
timdex_metadata_empty.recreate_static_database_file()
46+
timdex_metadata_empty.rebuild_dataset_metadata()
4747

4848

4949
def test_tdm_init_metadata_file_found_success(timdex_metadata):

tests/test_read.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ def test_dataset_load_current_records_gets_correct_same_day_full_run(
254254
timdex_dataset_same_day_runs,
255255
):
256256
# ensure metadata exists for this dataset
257-
timdex_dataset_same_day_runs.metadata.recreate_static_database_file()
257+
timdex_dataset_same_day_runs.metadata.rebuild_dataset_metadata()
258258
timdex_dataset_same_day_runs.metadata.refresh()
259259
df = timdex_dataset_same_day_runs.read_dataframe(
260260
table="current_records", run_type="full"
@@ -265,7 +265,7 @@ def test_dataset_load_current_records_gets_correct_same_day_full_run(
265265
def test_dataset_load_current_records_gets_correct_same_day_daily_runs_ordering(
266266
timdex_dataset_same_day_runs,
267267
):
268-
timdex_dataset_same_day_runs.metadata.recreate_static_database_file()
268+
timdex_dataset_same_day_runs.metadata.rebuild_dataset_metadata()
269269
timdex_dataset_same_day_runs.metadata.refresh()
270270
first_record = next(
271271
timdex_dataset_same_day_runs.read_dicts_iter(

timdex_dataset_api/dataset.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,10 @@ def location_scheme(self) -> Literal["file", "s3"]:
143143
def data_records_root(self) -> str:
144144
return f"{self.location.removesuffix('/')}/data/records" # type: ignore[union-attr]
145145

146+
def refresh(self) -> None:
147+
"""Fully reload TIMDEXDataset instance."""
148+
self.__init__(self.location) # type: ignore[misc]
149+
146150
def create_data_structure(self) -> None:
147151
"""Ensure ETL records data structure exists in TIMDEX dataset."""
148152
if self.location_scheme == "file":

timdex_dataset_api/metadata.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -249,16 +249,14 @@ def refresh(self) -> None:
249249
self.conn = self.setup_duckdb_context()
250250
self._sa_metadata = sa_reflect_duckdb_conn(self.conn, schema="metadata")
251251

252-
def recreate_static_database_file(self) -> None:
253-
"""Create/recreate the static metadata database file.
254-
255-
The following work is performed:
256-
1. Create a local working directory
257-
2. Open a DuckDB connection with a database file in this local working dir
258-
3. Create tables and views by scanning ETL data in dataset/data/records
259-
4. Close DuckDB connection ensuring a fully formed, local database file
260-
5. Upload DuckDB database file to target destination, making that the new
261-
static metadata database file
252+
def rebuild_dataset_metadata(self) -> None:
253+
"""Fully rebuild dataset metadata.
254+
255+
Work includes:
256+
- remove any append deltas, understanding a full metadata rebuild
257+
will pick up that data from the ETL records themselves
258+
- build a local, temporary static metadata database file, then overwrite the
259+
canonical version in the dataset (e.g. in S3)
262260
"""
263261
if self.location_scheme == "s3":
264262
s3_client = S3Client()
@@ -272,7 +270,6 @@ def recreate_static_database_file(self) -> None:
272270

273271
with duckdb.connect(local_db_path) as conn:
274272
self.configure_duckdb_connection(conn)
275-
conn.execute("""SET threads = 64;""")
276273

277274
self._create_full_dataset_table(conn)
278275

@@ -299,6 +296,9 @@ def _create_full_dataset_table(self, conn: DuckDBPyConnection) -> None:
299296
start_time = time.perf_counter()
300297
logger.info("creating table of full dataset metadata")
301298

299+
# temporarily increase thread count
300+
conn.execute("""SET threads = 64;""")
301+
302302
query = f"""
303303
create or replace table records as (
304304
select
@@ -312,6 +312,9 @@ def _create_full_dataset_table(self, conn: DuckDBPyConnection) -> None:
312312
"""
313313
conn.execute(query)
314314

315+
# reset thread count
316+
conn.execute(f"""SET threads = {self.config.duckdb_connection_threads};""")
317+
315318
row_count = conn.query("""select count(*) from records;""").fetchone()[0] # type: ignore[index]
316319
logger.info(
317320
f"'records' table created - rows: {row_count}, "

0 commit comments

Comments (0)