Skip to content

Commit 3efe8b7

Browse files
authored
Merge pull request #157 from MITLibraries/TIMX-530-create-static-metadata-db-file
TIMX 530 - rebuild TIMDEXDatasetMetadata with static database file approach
2 parents 3311978 + aaaadd0 commit 3efe8b7

9 files changed

Lines changed: 297 additions & 463 deletions

File tree

migrations/002_2025_06_25_consistent_run_timestamp_per_etl_run.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
# ruff: noqa: BLE001, D212, TRY300, TRY400
1+
# ruff: noqa: PGH004
2+
# ruff: noqa
3+
# type: ignore
4+
25
"""
36
Date: 2025-06-25
47
@@ -29,6 +32,10 @@
2932
pipenv run python migrations/002_2025_06_25_consistent_run_timestamp_per_etl_run.py \
3033
<DATASET_LOCATION> \
3134
--dry-run
35+
36+
Update: 2025-08-04
37+
38+
This migration is no longer functional given changes to TIMDEXDataset.
3239
"""
3340

3441
import argparse

tests/conftest.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def dataset_with_runs_location(tmp_path) -> str:
137137

138138

139139
@pytest.fixture
140-
def local_dataset_with_runs(dataset_with_runs_location) -> TIMDEXDataset:
140+
def dataset_with_runs(dataset_with_runs_location) -> TIMDEXDataset:
141141
return TIMDEXDataset(dataset_with_runs_location)
142142

143143

@@ -195,19 +195,26 @@ def dataset_with_same_day_runs(tmp_path) -> TIMDEXDataset:
195195
return timdex_dataset
196196

197197

198-
@pytest.fixture
199-
def timdex_dataset_metadata(dataset_with_same_day_runs):
200-
return TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
201-
202-
203198
@pytest.fixture
204199
def timdex_bucket():
205200
return "timdex"
206201

207202

208203
@pytest.fixture
209-
def mock_s3_resource(timdex_bucket):
204+
def mocked_timdex_bucket(timdex_bucket):
210205
with moto.mock_aws():
211206
conn = boto3.resource("s3", region_name="us-east-1")
212207
conn.create_bucket(Bucket=timdex_bucket)
213208
yield conn
209+
210+
211+
@pytest.fixture
212+
def timdex_dataset_metadata_empty(dataset_with_runs_location):
213+
return TIMDEXDatasetMetadata(dataset_with_runs_location)
214+
215+
216+
@pytest.fixture
217+
def timdex_dataset_metadata(dataset_with_runs_location):
218+
tdm = TIMDEXDatasetMetadata(dataset_with_runs_location)
219+
tdm.recreate_static_database_file()
220+
return tdm

tests/test_dataset.py

Lines changed: 35 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ def test_dataset_load_with_multi_nonpartition_filters_success(fixed_local_datase
137137
assert fixed_local_dataset.row_count == 1
138138

139139

140+
@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
140141
def test_dataset_load_current_records_all_sources_success(dataset_with_runs_location):
141142
timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
142143

@@ -149,6 +150,7 @@ def test_dataset_load_current_records_all_sources_success(dataset_with_runs_loca
149150
assert len(timdex_dataset.dataset.files) == 12
150151

151152

153+
@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
152154
def test_dataset_load_current_records_one_source_success(dataset_with_runs_location):
153155
timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
154156
timdex_dataset.load(current_records=True, source="alma")
@@ -346,9 +348,9 @@ def test_dataset_local_dataset_row_count_missing_dataset_raise_error(local_datas
346348
_ = td.row_count
347349

348350

349-
def test_dataset_all_records_not_current_and_not_deduped(local_dataset_with_runs):
350-
local_dataset_with_runs.load()
351-
all_records_df = local_dataset_with_runs.read_dataframe()
351+
def test_dataset_all_records_not_current_and_not_deduped(dataset_with_runs):
352+
dataset_with_runs.load()
353+
all_records_df = dataset_with_runs.read_dataframe()
352354

353355
# assert counts reflect all records from dataset, no deduping
354356
assert all_records_df.source.value_counts().to_dict() == {"alma": 254, "dspace": 194}
@@ -358,9 +360,10 @@ def test_dataset_all_records_not_current_and_not_deduped(local_dataset_with_runs
358360
assert all_records_df.run_date.max() == date(2025, 2, 5)
359361

360362

361-
def test_dataset_all_current_records_deduped(local_dataset_with_runs):
362-
local_dataset_with_runs.load(current_records=True)
363-
all_records_df = local_dataset_with_runs.read_dataframe()
363+
@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
364+
def test_dataset_all_current_records_deduped(dataset_with_runs):
365+
dataset_with_runs.load(current_records=True)
366+
all_records_df = dataset_with_runs.read_dataframe()
364367

365368
# assert both sources have accurate record counts for current records only
366369
assert all_records_df.source.value_counts().to_dict() == {"dspace": 90, "alma": 100}
@@ -373,9 +376,10 @@ def test_dataset_all_current_records_deduped(local_dataset_with_runs):
373376
assert all_records_df.run_date.max() == date(2025, 2, 5) # dspace
374377

375378

376-
def test_dataset_source_current_records_deduped(local_dataset_with_runs):
377-
local_dataset_with_runs.load(current_records=True, source="alma")
378-
alma_records_df = local_dataset_with_runs.read_dataframe()
379+
@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
380+
def test_dataset_source_current_records_deduped(dataset_with_runs):
381+
dataset_with_runs.load(current_records=True, source="alma")
382+
alma_records_df = dataset_with_runs.read_dataframe()
379383

380384
# assert only alma records present and correct count
381385
assert alma_records_df.source.value_counts().to_dict() == {"alma": 100}
@@ -388,36 +392,40 @@ def test_dataset_source_current_records_deduped(local_dataset_with_runs):
388392
assert alma_records_df.run_date.max() == date(2025, 1, 5)
389393

390394

395+
@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
391396
def test_dataset_all_read_methods_get_deduplication(
392-
local_dataset_with_runs,
397+
dataset_with_runs,
393398
):
394-
local_dataset_with_runs.load(current_records=True, source="alma")
399+
dataset_with_runs.load(current_records=True, source="alma")
395400

396-
full_df = local_dataset_with_runs.read_dataframe()
397-
all_records = list(local_dataset_with_runs.read_dicts_iter())
398-
transformed_records = list(local_dataset_with_runs.read_transformed_records_iter())
401+
full_df = dataset_with_runs.read_dataframe()
402+
all_records = list(dataset_with_runs.read_dicts_iter())
403+
transformed_records = list(dataset_with_runs.read_transformed_records_iter())
399404

400405
assert len(full_df) == len(all_records) == len(transformed_records)
401406

402407

408+
@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
403409
def test_dataset_current_records_no_additional_filtering_accurate_records_yielded(
404-
local_dataset_with_runs,
410+
dataset_with_runs,
405411
):
406-
local_dataset_with_runs.load(current_records=True, source="alma")
407-
df = local_dataset_with_runs.read_dataframe()
412+
dataset_with_runs.load(current_records=True, source="alma")
413+
df = dataset_with_runs.read_dataframe()
408414
assert df.action.value_counts().to_dict() == {"index": 99, "delete": 1}
409415

410416

417+
@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
411418
def test_dataset_current_records_action_filtering_accurate_records_yielded(
412-
local_dataset_with_runs,
419+
dataset_with_runs,
413420
):
414-
local_dataset_with_runs.load(current_records=True, source="alma")
415-
df = local_dataset_with_runs.read_dataframe(action="index")
421+
dataset_with_runs.load(current_records=True, source="alma")
422+
df = dataset_with_runs.read_dataframe(action="index")
416423
assert df.action.value_counts().to_dict() == {"index": 99}
417424

418425

426+
@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
419427
def test_dataset_current_records_index_filtering_accurate_records_yielded(
420-
local_dataset_with_runs,
428+
dataset_with_runs,
421429
):
422430
"""This is a somewhat complex test, but demonstrates that only 'current' records
423431
are yielded when .load(current_records=True) is applied.
@@ -437,14 +445,14 @@ def test_dataset_current_records_index_filtering_accurate_records_yielded(
437445
"influenced" what records we would see as we continue backwards in time.
438446
"""
439447
# with current_records=False, we get all 25 records from run-5
440-
local_dataset_with_runs.load(current_records=False, source="alma")
441-
df = local_dataset_with_runs.read_dataframe(run_id="run-5")
448+
dataset_with_runs.load(current_records=False, source="alma")
449+
df = dataset_with_runs.read_dataframe(run_id="run-5")
442450
assert len(df) == 25
443451

444452
# with current_records=True, we only get 15 records from run-5
445453
# because newer run-6 influenced what records are current for older run-5
446-
local_dataset_with_runs.load(current_records=True, source="alma")
447-
df = local_dataset_with_runs.read_dataframe(run_id="run-5")
454+
dataset_with_runs.load(current_records=True, source="alma")
455+
df = dataset_with_runs.read_dataframe(run_id="run-5")
448456
assert len(df) == 15
449457
assert list(df.timdex_record_id) == [
450458
"alma:10",
@@ -465,6 +473,7 @@ def test_dataset_current_records_index_filtering_accurate_records_yielded(
465473
]
466474

467475

476+
@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
468477
def test_dataset_load_current_records_gets_correct_same_day_full_run(
469478
dataset_with_same_day_runs,
470479
):
@@ -477,6 +486,7 @@ def test_dataset_load_current_records_gets_correct_same_day_full_run(
477486
assert list(df.run_id.unique()) == ["run-2"]
478487

479488

489+
@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
480490
def test_dataset_load_current_records_gets_correct_same_day_daily_runs_ordering(
481491
dataset_with_same_day_runs,
482492
):

tests/test_metadata.py

Lines changed: 29 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,89 +1,46 @@
1-
# ruff: noqa: PLR2004
1+
from duckdb import DuckDBPyConnection
22

3-
import duckdb
3+
from timdex_dataset_api import TIMDEXDatasetMetadata
44

5-
from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata
65

6+
def test_tdm_init_no_metadata_file_warning_success(caplog, dataset_with_runs_location):
7+
tdm = TIMDEXDatasetMetadata(dataset_with_runs_location)
78

8-
def test_tdm_init_from_timdex_dataset_instance_success(dataset_with_same_day_runs):
9-
tdm = TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
10-
assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
9+
assert tdm.conn is None
10+
assert "Static metadata database not found" in caplog.text
1111

1212

13-
def test_tdm_init_from_timdex_dataset_path_success(dataset_with_runs_location):
14-
tdm = TIMDEXDatasetMetadata.from_dataset_location(dataset_with_runs_location)
15-
assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
13+
def test_tdm_local_dataset_structure_properties():
14+
local_root = "/path/to/nothing"
15+
tdm_local = TIMDEXDatasetMetadata(local_root)
16+
assert tdm_local.location == local_root
17+
assert tdm_local.location_scheme == "file"
1618

1719

18-
def test_tdm_default_database_location_in_memory(timdex_dataset_metadata):
19-
assert timdex_dataset_metadata.db_path == ":memory:"
20-
result = timdex_dataset_metadata.conn.query("PRAGMA database_list;").fetchone()
21-
assert result[1] == "memory" # name of database
22-
assert result[2] is None # file associated with database, where None is memory
20+
def test_tdm_s3_dataset_structure_properties(mocked_timdex_bucket):
21+
s3_root = "s3://timdex/dataset"
22+
tdm_s3 = TIMDEXDatasetMetadata(s3_root)
23+
assert tdm_s3.location == s3_root
24+
assert tdm_s3.location_scheme == "s3"
2325

2426

25-
def test_tdm_explicit_database_in_file(tmp_path, dataset_with_runs_location):
26-
db_path = str(tmp_path / "tda.duckdb")
27-
tdm = TIMDEXDatasetMetadata.from_dataset_location(
28-
dataset_with_runs_location,
29-
db_path=db_path,
30-
)
31-
assert tdm.db_path == db_path
32-
result = tdm.conn.query("PRAGMA database_list;").fetchone()
33-
assert result[1] == "tda" # name of database
34-
assert result[2] == db_path # filepath passed during init
27+
def test_tdm_create_metadata_database_file_success(caplog, timdex_dataset_metadata_empty):
28+
caplog.set_level("DEBUG")
29+
timdex_dataset_metadata_empty.recreate_static_database_file()
3530

3631

37-
def test_tdm_get_duckdb_connection(timdex_dataset_metadata):
38-
conn = timdex_dataset_metadata.get_connection()
39-
assert isinstance(conn, duckdb.DuckDBPyConnection)
32+
def test_tdm_init_metadata_file_found_success(timdex_dataset_metadata):
33+
assert isinstance(timdex_dataset_metadata.conn, DuckDBPyConnection)
4034

4135

42-
def test_tdm_set_threads(timdex_dataset_metadata):
43-
# set to 64
44-
timdex_dataset_metadata.set_database_thread_usage(64)
45-
sixty_four_thread_count = timdex_dataset_metadata.conn.query(
46-
"""SELECT current_setting('threads');"""
47-
).fetchone()[0]
48-
assert sixty_four_thread_count == 64
36+
def test_tdm_connection_has_static_database_attached(timdex_dataset_metadata):
37+
assert set(
38+
timdex_dataset_metadata.conn.query("""show databases;""").to_df().database_name
39+
) == {"memory", "static_db"}
4940

50-
# set to 12
51-
timdex_dataset_metadata.set_database_thread_usage(12)
52-
sixty_four_thread_count = timdex_dataset_metadata.conn.query(
53-
"""SELECT current_setting('threads');"""
54-
).fetchone()[0]
55-
assert sixty_four_thread_count == 12
5641

57-
58-
def test_tdm_init_sets_up_database(timdex_dataset_metadata):
59-
df = timdex_dataset_metadata.conn.query("show tables;").to_df()
60-
assert set(df.name) == {"current_records", "records"}
61-
62-
63-
def test_tdm_get_current_parquet_files(timdex_dataset_metadata):
64-
parquet_files = timdex_dataset_metadata.get_current_parquet_files()
65-
# assert 5 total parquet files in dataset
66-
# but only 3 contain current records
67-
assert len(timdex_dataset_metadata.timdex_dataset.dataset.files) == 5
68-
assert len(parquet_files) == 3
69-
70-
71-
def test_tdm_get_record_to_run_mapping(timdex_dataset_metadata):
72-
record_map = timdex_dataset_metadata.get_current_record_to_run_map()
73-
74-
assert len(record_map) == 75
75-
assert record_map["alma:0"] == "run-5"
76-
assert record_map["alma:5"] == "run-4"
77-
assert record_map["alma:19"] == "run-4"
78-
assert "run-3" not in record_map.values()
79-
assert record_map["alma:20"] == "run-2"
80-
81-
82-
def test_tdm_current_records_subset_of_all_records(timdex_dataset_metadata):
83-
records_df = timdex_dataset_metadata.conn.query("select * from records;").to_df()
84-
current_records_df = timdex_dataset_metadata.conn.query(
85-
"select * from current_records;"
42+
def test_tdm_connection_static_database_records_table_exists(timdex_dataset_metadata):
43+
records_df = timdex_dataset_metadata.conn.query(
44+
"""select * from static_db.records;"""
8645
).to_df()
87-
assert set(current_records_df.timdex_record_id).issubset(
88-
set(records_df.timdex_record_id)
89-
)
46+
assert len(records_df) > 0

tests/test_s3client.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def test_split_s3_uri_invalid():
4242
client._split_s3_uri("timdex/path/to/file.txt")
4343

4444

45-
def test_upload_download_file(mock_s3_resource, tmp_path):
45+
def test_upload_download_file(mocked_timdex_bucket, tmp_path):
4646
"""Test upload_file and download_file methods."""
4747
client = S3Client()
4848

@@ -62,7 +62,7 @@ def test_upload_download_file(mock_s3_resource, tmp_path):
6262
assert download_path.read_text() == "test content"
6363

6464

65-
def test_delete_file(mock_s3_resource, tmp_path):
65+
def test_delete_file(mocked_timdex_bucket, tmp_path):
6666
"""Test delete_file method."""
6767
client = S3Client()
6868

@@ -76,12 +76,12 @@ def test_delete_file(mock_s3_resource, tmp_path):
7676
client.delete_file(s3_uri)
7777

7878
# Verify the file is deleted
79-
bucket = mock_s3_resource.Bucket("timdex")
79+
bucket = mocked_timdex_bucket.Bucket("timdex")
8080
objects = list(bucket.objects.all())
8181
assert len(objects) == 0
8282

8383

84-
def test_delete_folder(mock_s3_resource, tmp_path):
84+
def test_delete_folder(mocked_timdex_bucket, tmp_path):
8585
"""Test delete_folder method."""
8686
client = S3Client()
8787

@@ -104,7 +104,7 @@ def test_delete_folder(mock_s3_resource, tmp_path):
104104
assert len(deleted_keys) == 3
105105
assert all(key.startswith("folder/") for key in deleted_keys)
106106

107-
bucket = mock_s3_resource.Bucket("timdex")
107+
bucket = mocked_timdex_bucket.Bucket("timdex")
108108
objects = list(bucket.objects.all())
109109
assert len(objects) == 1
110110
assert objects[0].key == "other.txt"

tests/test_write.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ def test_dataset_write_record_batches_uses_batch_size(
5252
)
5353

5454

55+
@pytest.mark.skip(
56+
reason="Test unneeded soon when list[str] not supported for dataset location."
57+
)
5558
def test_dataset_write_to_multiple_locations_raise_error(sample_records_iter):
5659
timdex_dataset = TIMDEXDataset(
5760
location=["/path/to/records-1.parquet", "/path/to/records-2.parquet"]

0 commit comments

Comments
 (0)