
Commit 7f7800d

Begin rebuilding of data and metadata tests
Why these changes are being introduced:

With the big changes to TIMDEXMetadataDataset comes the need to virtually rewrite the test suite for that class. The changes to TIMDEXMetadataDataset also influence tests for TIMDEXDataset, both how it is loaded and how 'current' record reading is tested.

How this addresses that need:

This begins with some basic tests around the loading, creating, and attaching of a static database file for TIMDEXMetadataDataset. Future tests will more fully exercise the final views and tables created. This commit also *temporarily* skips a number of TIMDEXDataset tests that will not pass until the ability to limit to 'current' records is reinstated with the updated TIMDEXMetadataDataset.

Side effects of this change:
* Test suite passes, but multiple tests are temporarily skipped.

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-530
1 parent 37c9275 commit 7f7800d

6 files changed

Lines changed: 107 additions & 73 deletions
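
For orientation before the per-file diffs, a minimal sketch of the TIMDEXDatasetMetadata lifecycle this commit's tests exercise, assembled from the fixtures and assertions below (the dataset path is a placeholder):

    from timdex_dataset_api import TIMDEXDatasetMetadata

    # initializing against a dataset location with no static metadata database
    # file logs "Static metadata database not found" and leaves conn as None
    tdm = TIMDEXDatasetMetadata("/path/to/dataset")
    assert tdm.conn is None

    # recreating the static database file builds it locally, copies it to the
    # dataset's metadata location, and refreshes the DuckDB connection
    tdm.recreate_static_database_file()
    assert tdm.conn is not None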


tests/conftest.py

Lines changed: 14 additions & 7 deletions
@@ -137,7 +137,7 @@ def dataset_with_runs_location(tmp_path) -> str:


 @pytest.fixture
-def local_dataset_with_runs(dataset_with_runs_location) -> TIMDEXDataset:
+def dataset_with_runs(dataset_with_runs_location) -> TIMDEXDataset:
     return TIMDEXDataset(dataset_with_runs_location)


@@ -195,19 +195,26 @@ def dataset_with_same_day_runs(tmp_path) -> TIMDEXDataset:
     return timdex_dataset


-@pytest.fixture
-def timdex_dataset_metadata(dataset_with_same_day_runs):
-    return TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
-
-
 @pytest.fixture
 def timdex_bucket():
     return "timdex"


 @pytest.fixture
-def mock_s3_resource(timdex_bucket):
+def mocked_timdex_bucket(timdex_bucket):
     with moto.mock_aws():
         conn = boto3.resource("s3", region_name="us-east-1")
         conn.create_bucket(Bucket=timdex_bucket)
         yield conn
+
+
+@pytest.fixture
+def timdex_dataset_metadata_empty(dataset_with_runs_location):
+    return TIMDEXDatasetMetadata(dataset_with_runs_location)
+
+
+@pytest.fixture
+def timdex_dataset_metadata(dataset_with_runs_location):
+    tdm = TIMDEXDatasetMetadata(dataset_with_runs_location)
+    tdm.recreate_static_database_file()
+    return tdm
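
The two fixtures intentionally capture the two states the metadata tests need: timdex_dataset_metadata_empty yields an instance before any static database file exists, while timdex_dataset_metadata builds that file first. A hypothetical test consuming both might read as follows (parameter order matters here, since both fixtures share the same dataset location and the "empty" one must initialize before the database file is built):

    def test_fixture_states(timdex_dataset_metadata_empty, timdex_dataset_metadata):
        # no static database file existed at init time, so no connection
        assert timdex_dataset_metadata_empty.conn is None

        # recreate_static_database_file() already ran, so a connection is attached
        assert timdex_dataset_metadata.conn is not None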

tests/test_dataset.py

Lines changed: 35 additions & 25 deletions
@@ -137,6 +137,7 @@ def test_dataset_load_with_multi_nonpartition_filters_success(fixed_local_datase
     assert fixed_local_dataset.row_count == 1


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_load_current_records_all_sources_success(dataset_with_runs_location):
     timdex_dataset = TIMDEXDataset(dataset_with_runs_location)

@@ -149,6 +150,7 @@ def test_dataset_load_current_records_all_sources_success(dataset_with_runs_loca
     assert len(timdex_dataset.dataset.files) == 12


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_load_current_records_one_source_success(dataset_with_runs_location):
     timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
     timdex_dataset.load(current_records=True, source="alma")
@@ -346,9 +348,9 @@ def test_dataset_local_dataset_row_count_missing_dataset_raise_error(local_datas
         _ = td.row_count


-def test_dataset_all_records_not_current_and_not_deduped(local_dataset_with_runs):
-    local_dataset_with_runs.load()
-    all_records_df = local_dataset_with_runs.read_dataframe()
+def test_dataset_all_records_not_current_and_not_deduped(dataset_with_runs):
+    dataset_with_runs.load()
+    all_records_df = dataset_with_runs.read_dataframe()

     # assert counts reflect all records from dataset, no deduping
     assert all_records_df.source.value_counts().to_dict() == {"alma": 254, "dspace": 194}
@@ -358,9 +360,10 @@ def test_dataset_all_records_not_current_and_not_deduped(local_dataset_with_runs
     assert all_records_df.run_date.max() == date(2025, 2, 5)


-def test_dataset_all_current_records_deduped(local_dataset_with_runs):
-    local_dataset_with_runs.load(current_records=True)
-    all_records_df = local_dataset_with_runs.read_dataframe()
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
+def test_dataset_all_current_records_deduped(dataset_with_runs):
+    dataset_with_runs.load(current_records=True)
+    all_records_df = dataset_with_runs.read_dataframe()

     # assert both sources have accurate record counts for current records only
     assert all_records_df.source.value_counts().to_dict() == {"dspace": 90, "alma": 100}
@@ -373,9 +376,10 @@ def test_dataset_all_current_records_deduped(local_dataset_with_runs
     assert all_records_df.run_date.max() == date(2025, 2, 5)  # dspace


-def test_dataset_source_current_records_deduped(local_dataset_with_runs):
-    local_dataset_with_runs.load(current_records=True, source="alma")
-    alma_records_df = local_dataset_with_runs.read_dataframe()
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
+def test_dataset_source_current_records_deduped(dataset_with_runs):
+    dataset_with_runs.load(current_records=True, source="alma")
+    alma_records_df = dataset_with_runs.read_dataframe()

     # assert only alma records present and correct count
     assert alma_records_df.source.value_counts().to_dict() == {"alma": 100}
@@ -388,36 +392,40 @@ def test_dataset_source_current_records_deduped(local_dataset_with_runs
     assert alma_records_df.run_date.max() == date(2025, 1, 5)


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_all_read_methods_get_deduplication(
-    local_dataset_with_runs,
+    dataset_with_runs,
 ):
-    local_dataset_with_runs.load(current_records=True, source="alma")
+    dataset_with_runs.load(current_records=True, source="alma")

-    full_df = local_dataset_with_runs.read_dataframe()
-    all_records = list(local_dataset_with_runs.read_dicts_iter())
-    transformed_records = list(local_dataset_with_runs.read_transformed_records_iter())
+    full_df = dataset_with_runs.read_dataframe()
+    all_records = list(dataset_with_runs.read_dicts_iter())
+    transformed_records = list(dataset_with_runs.read_transformed_records_iter())

     assert len(full_df) == len(all_records) == len(transformed_records)


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_current_records_no_additional_filtering_accurate_records_yielded(
-    local_dataset_with_runs,
+    dataset_with_runs,
 ):
-    local_dataset_with_runs.load(current_records=True, source="alma")
-    df = local_dataset_with_runs.read_dataframe()
+    dataset_with_runs.load(current_records=True, source="alma")
+    df = dataset_with_runs.read_dataframe()
     assert df.action.value_counts().to_dict() == {"index": 99, "delete": 1}


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_current_records_action_filtering_accurate_records_yielded(
-    local_dataset_with_runs,
+    dataset_with_runs,
 ):
-    local_dataset_with_runs.load(current_records=True, source="alma")
-    df = local_dataset_with_runs.read_dataframe(action="index")
+    dataset_with_runs.load(current_records=True, source="alma")
+    df = dataset_with_runs.read_dataframe(action="index")
     assert df.action.value_counts().to_dict() == {"index": 99}


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_current_records_index_filtering_accurate_records_yielded(
-    local_dataset_with_runs,
+    dataset_with_runs,
 ):
     """This is a somewhat complex test, but demonstrates that only 'current' records
     are yielded when .load(current_records=True) is applied.
@@ -437,14 +445,14 @@ def test_dataset_current_records_index_filtering_accurate_records_yielded(
     "influenced" what records we would see as we continue backwards in time.
     """
     # with current_records=False, we get all 25 records from run-5
-    local_dataset_with_runs.load(current_records=False, source="alma")
-    df = local_dataset_with_runs.read_dataframe(run_id="run-5")
+    dataset_with_runs.load(current_records=False, source="alma")
+    df = dataset_with_runs.read_dataframe(run_id="run-5")
     assert len(df) == 25

     # with current_records=True, we only get 15 records from run-5
     # because newer run-6 influenced what records are current for older run-5
-    local_dataset_with_runs.load(current_records=True, source="alma")
-    df = local_dataset_with_runs.read_dataframe(run_id="run-5")
+    dataset_with_runs.load(current_records=True, source="alma")
+    df = dataset_with_runs.read_dataframe(run_id="run-5")
     assert len(df) == 15
     assert list(df.timdex_record_id) == [
         "alma:10",
@@ -465,6 +473,7 @@ def test_dataset_current_records_index_filtering_accurate_records_yielded(
     ]


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_load_current_records_gets_correct_same_day_full_run(
     dataset_with_same_day_runs,
 ):
@@ -477,6 +486,7 @@ def test_dataset_load_current_records_gets_correct_same_day_full_run(
     assert list(df.run_id.unique()) == ["run-2"]


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_load_current_records_gets_correct_same_day_daily_runs_ordering(
     dataset_with_same_day_runs,
 ):

tests/test_metadata.py

Lines changed: 21 additions & 27 deletions
@@ -1,42 +1,36 @@
-# ruff: noqa: PLR2004
+from duckdb import DuckDBPyConnection

-import duckdb
+from timdex_dataset_api import TIMDEXDatasetMetadata

-from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata

+def test_tdm_init_no_metadata_file_warning_success(caplog, dataset_with_runs_location):
+    tdm = TIMDEXDatasetMetadata(dataset_with_runs_location)

-def test_tdm_init_from_timdex_dataset_instance_success(dataset_with_same_day_runs):
-    tdm = TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
-    assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
+    assert tdm.conn is None
+    assert "Static metadata database not found" in caplog.text


-def test_tdm_init_from_timdex_dataset_path_success(dataset_with_runs_location):
-    tdm = TIMDEXDatasetMetadata.from_dataset_location(dataset_with_runs_location)
-    assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
+def test_tdm_local_dataset_structure_properties():
+    local_root = "/path/to/nothing"
+    tdm_local = TIMDEXDatasetMetadata(local_root)
+    assert tdm_local.location == local_root
+    assert tdm_local.location_scheme == "file"


-def test_tdm_default_database_location_in_memory(timdex_dataset_metadata):
-    assert timdex_dataset_metadata.db_path == ":memory:"
-    result = timdex_dataset_metadata.conn.query("PRAGMA database_list;").fetchone()
-    assert result[1] == "memory"  # name of database
-    assert result[2] is None  # file associated with database, where None is memory
+def test_tdm_s3_dataset_structure_properties(mocked_timdex_bucket):
+    s3_root = "s3://timdex/dataset"
+    tdm_s3 = TIMDEXDatasetMetadata(s3_root)
+    assert tdm_s3.location == s3_root
+    assert tdm_s3.location_scheme == "s3"


-def test_tdm_explicit_database_in_file(tmp_path, dataset_with_runs_location):
-    db_path = str(tmp_path / "tda.duckdb")
-    tdm = TIMDEXDatasetMetadata.from_dataset_location(
-        dataset_with_runs_location,
-        db_path=db_path,
-    )
-    assert tdm.db_path == db_path
-    result = tdm.conn.query("PRAGMA database_list;").fetchone()
-    assert result[1] == "tda"  # name of database
-    assert result[2] == db_path  # filepath passed during init
+def test_tdm_create_metadata_database_file_success(caplog, timdex_dataset_metadata_empty):
+    caplog.set_level("DEBUG")
+    timdex_dataset_metadata_empty.recreate_static_database_file()


-def test_tdm_get_duckdb_connection(timdex_dataset_metadata):
-    conn = timdex_dataset_metadata.get_connection()
-    assert isinstance(conn, duckdb.DuckDBPyConnection)
+def test_tdm_init_metadata_file_found_success(timdex_dataset_metadata):
+    assert isinstance(timdex_dataset_metadata.conn, DuckDBPyConnection)


 def test_tdm_connection_has_static_database_attached(timdex_dataset_metadata):

tests/test_s3client.py

Lines changed: 5 additions & 5 deletions
@@ -42,7 +42,7 @@ def test_split_s3_uri_invalid():
         client._split_s3_uri("timdex/path/to/file.txt")


-def test_upload_download_file(mock_s3_resource, tmp_path):
+def test_upload_download_file(mocked_timdex_bucket, tmp_path):
     """Test upload_file and download_file methods."""
     client = S3Client()

@@ -62,7 +62,7 @@ def test_upload_download_file(mock_s3_resource, tmp_path):
     assert download_path.read_text() == "test content"


-def test_delete_file(mock_s3_resource, tmp_path):
+def test_delete_file(mocked_timdex_bucket, tmp_path):
     """Test delete_file method."""
     client = S3Client()

@@ -76,12 +76,12 @@ def test_delete_file(mock_s3_resource, tmp_path):
     client.delete_file(s3_uri)

     # Verify the file is deleted
-    bucket = mock_s3_resource.Bucket("timdex")
+    bucket = mocked_timdex_bucket.Bucket("timdex")
     objects = list(bucket.objects.all())
     assert len(objects) == 0


-def test_delete_folder(mock_s3_resource, tmp_path):
+def test_delete_folder(mocked_timdex_bucket, tmp_path):
     """Test delete_folder method."""
     client = S3Client()

@@ -104,7 +104,7 @@ def test_delete_folder(mock_s3_resource, tmp_path):
     assert len(deleted_keys) == 3
     assert all(key.startswith("folder/") for key in deleted_keys)

-    bucket = mock_s3_resource.Bucket("timdex")
+    bucket = mocked_timdex_bucket.Bucket("timdex")
     objects = list(bucket.objects.all())
     assert len(objects) == 1
     assert objects[0].key == "other.txt"

tests/test_write.py

Lines changed: 3 additions & 0 deletions
@@ -52,6 +52,9 @@ def test_dataset_write_record_batches_uses_batch_size(
     )


+@pytest.mark.skip(
+    reason="Test unneeded soon when list[str] not supported for dataset location."
+)
 def test_dataset_write_to_multiple_locations_raise_error(sample_records_iter):
     timdex_dataset = TIMDEXDataset(
         location=["/path/to/records-1.parquet", "/path/to/records-2.parquet"]

timdex_dataset_api/metadata.py

Lines changed: 29 additions & 9 deletions
@@ -1,9 +1,11 @@
 """timdex_dataset_api/metadata.py"""

 import os
+import shutil
 import tempfile
 import time
 from pathlib import Path
+from typing import Literal
 from urllib.parse import urlparse

 import duckdb
@@ -41,6 +43,15 @@ def __init__(
         self.location = location
         self.conn: None | DuckDBPyConnection = self.setup_duckdb_context()

+    @property
+    def location_scheme(self) -> Literal["file", "s3"]:
+        scheme = urlparse(self.location).scheme
+        if scheme == "":
+            return "file"
+        if scheme == "s3":
+            return "s3"
+        raise ValueError(f"Location with scheme type '{scheme}' not supported.")
+
     @property
     def metadata_root(self) -> str:
         return f"{self.location.removesuffix('/')}/metadata"
@@ -59,7 +70,7 @@ def append_deltas_path(self) -> str:

     def database_exists(self) -> bool:
         """Check if static metadata database file exists."""
-        if urlparse(self.metadata_database_path).scheme == "s3":
+        if self.location_scheme == "s3":
             s3_client = S3Client()
             return s3_client.object_exists(self.metadata_database_path)
         return os.path.exists(self.metadata_database_path)
@@ -75,10 +86,11 @@ def recreate_static_database_file(self) -> None:
         5. Upload DuckDB database file to target destination, making that the new
            static metadata database file
         """
-        s3_client = S3Client()
-
-        # remove any append deltas that may exist at this time of database recreation
-        s3_client.delete_folder(self.append_deltas_path)
+        if self.location_scheme == "s3":
+            s3_client = S3Client()
+            s3_client.delete_folder(self.append_deltas_path)
+        else:
+            shutil.rmtree(self.append_deltas_path, ignore_errors=True)

         # build database locally
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -91,10 +103,18 @@
             self._create_full_dataset_table(conn)

             # copy local database file to remote location
-            s3_client.upload_file(
-                local_db_path,
-                self.metadata_database_path,
-            )
+            if self.location_scheme == "s3":
+                s3_client = S3Client()
+                s3_client.upload_file(
+                    local_db_path,
+                    self.metadata_database_path,
+                )
+            else:
+                Path(self.metadata_database_path).parent.mkdir(
+                    parents=True,
+                    exist_ok=True,
+                )
+                shutil.copy(local_db_path, self.metadata_database_path)

             # refresh DuckDB connection
             self.conn = self.setup_duckdb_context()
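
As a quick illustration, the new location_scheme property resolves as sketched here; the paths are hypothetical, and as in the tests above an S3 location assumes a reachable (or moto-mocked) bucket:

    # as in test_tdm_local_dataset_structure_properties above
    tdm_local = TIMDEXDatasetMetadata("/data/timdex-dataset")
    assert tdm_local.location_scheme == "file"  # empty URI scheme -> local filesystem

    # as in test_tdm_s3_dataset_structure_properties (bucket mocked via moto there)
    tdm_s3 = TIMDEXDatasetMetadata("s3://timdex/dataset")
    assert tdm_s3.location_scheme == "s3"

    # any other scheme (e.g. "ftp") raises ValueError when the property is read

This lets database_exists() and recreate_static_database_file() branch between S3Client calls and plain filesystem operations without re-parsing the location each time.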
