Skip to content

Commit 3f90fc6

Browse files
committed
Create TIMDEXDatasetMetadata class
Why these changes are being introduced:
As outlined in more detail in TIMX-506, this class is a new, alternate approach to providing high-level metadata about the dataset to better support bulk operations. This class is designed to remove the need for TIMDEXRunManager, while also providing more helpful and granular data about the dataset. Ultimately this will reduce code complexity by relying less on implicit data contracts about the dataset and more on verifiable, in-hand data about the dataset that can be used for bulk operations (e.g. yielding current records).

How this addresses that need:
* New class TIMDEXDatasetMetadata is created
* Uses DuckDB to quickly crawl the dataset, generating an in-memory or on-disk database of record metadata
* This class is not yet wired into any TIMDEXDataset operations; this will happen in future work

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-506
1 parent 4235515 commit 3f90fc6

4 files changed

Lines changed: 391 additions & 1 deletion

File tree

tests/conftest.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
generate_sample_records,
1111
generate_sample_records_with_simulated_partitions,
1212
)
13-
from timdex_dataset_api import TIMDEXDataset
13+
from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata
1414
from timdex_dataset_api.dataset import TIMDEXDatasetConfig
1515

1616

@@ -208,3 +208,8 @@ def dataset_with_same_day_runs(tmp_path) -> TIMDEXDataset:
208208
timdex_dataset.load()
209209

210210
return timdex_dataset
211+
212+
213+
@pytest.fixture
214+
def timdex_dataset_metadata(dataset_with_same_day_runs):
215+
return TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)

tests/test_metadata.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# ruff: noqa: PLR2004
2+
3+
import duckdb
4+
5+
from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata
6+
7+
8+
def test_tdm_init_from_timdex_dataset_instance_success(dataset_with_same_day_runs):
9+
tdm = TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
10+
assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
11+
12+
13+
def test_tdm_init_from_timdex_dataset_path_success(dataset_with_runs_location):
14+
tdm = TIMDEXDatasetMetadata.from_dataset_location(dataset_with_runs_location)
15+
assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
16+
17+
18+
def test_tdm_default_database_location_in_memory(timdex_dataset_metadata):
19+
assert timdex_dataset_metadata.db_path == ":memory:"
20+
result = timdex_dataset_metadata.conn.query("PRAGMA database_list;").fetchone()
21+
assert result[1] == "memory" # name of database
22+
assert result[2] is None # file associated with database, where None is memory
23+
24+
25+
def test_tdm_explicit_database_in_file(tmp_path, dataset_with_runs_location):
26+
db_path = str(tmp_path / "tda.duckdb")
27+
tdm = TIMDEXDatasetMetadata.from_dataset_location(
28+
dataset_with_runs_location,
29+
db_path=db_path,
30+
)
31+
assert tdm.db_path == db_path
32+
result = tdm.conn.query("PRAGMA database_list;").fetchone()
33+
assert result[1] == "tda" # name of database
34+
assert result[2] == db_path # filepath passed during init
35+
36+
37+
def test_tdm_get_duckdb_connection(timdex_dataset_metadata):
38+
conn = timdex_dataset_metadata.get_connection()
39+
assert isinstance(conn, duckdb.DuckDBPyConnection)
40+
41+
42+
def test_tdm_set_threads(timdex_dataset_metadata):
43+
# set to 64
44+
timdex_dataset_metadata.set_database_thread_usage(64)
45+
sixty_four_thread_count = timdex_dataset_metadata.conn.query(
46+
"""SELECT current_setting('threads');"""
47+
).fetchone()[0]
48+
assert sixty_four_thread_count == 64
49+
50+
# set to 12
51+
timdex_dataset_metadata.set_database_thread_usage(12)
52+
sixty_four_thread_count = timdex_dataset_metadata.conn.query(
53+
"""SELECT current_setting('threads');"""
54+
).fetchone()[0]
55+
assert sixty_four_thread_count == 12
56+
57+
58+
def test_tdm_init_sets_up_database(timdex_dataset_metadata):
59+
df = timdex_dataset_metadata.conn.query("show tables;").to_df()
60+
assert set(df.name) == {"current_records", "records"}
61+
62+
63+
def test_tdm_get_current_parquet_files(timdex_dataset_metadata):
64+
parquet_files = timdex_dataset_metadata.get_current_parquet_files()
65+
# assert 5 total parquet files in dataset
66+
# but only 3 contain current records
67+
assert len(timdex_dataset_metadata.timdex_dataset.dataset.files) == 5
68+
assert len(parquet_files) == 3
69+
70+
71+
def test_tdm_get_record_to_run_mapping(timdex_dataset_metadata):
72+
record_map = timdex_dataset_metadata.get_current_record_to_run_map()
73+
74+
assert len(record_map) == 75
75+
assert record_map["alma:0"] == "run-5"
76+
assert record_map["alma:5"] == "run-4"
77+
assert record_map["alma:19"] == "run-4"
78+
assert "run-3" not in record_map.values()
79+
assert record_map["alma:20"] == "run-2"
80+
81+
82+
def test_tdm_current_records_subset_of_all_records(timdex_dataset_metadata):
83+
records_df = timdex_dataset_metadata.conn.query("select * from records;").to_df()
84+
current_records_df = timdex_dataset_metadata.conn.query(
85+
"select * from current_records;"
86+
).to_df()
87+
assert set(current_records_df.timdex_record_id).issubset(
88+
set(records_df.timdex_record_id)
89+
)

timdex_dataset_api/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
"""timdex_dataset_api/__init__.py"""
22

33
from timdex_dataset_api.dataset import TIMDEXDataset
4+
from timdex_dataset_api.metadata import TIMDEXDatasetMetadata
45
from timdex_dataset_api.record import DatasetRecord
56

67
__version__ = "2.1.0"
78

89
__all__ = [
910
"DatasetRecord",
1011
"TIMDEXDataset",
12+
"TIMDEXDatasetMetadata",
1113
]

0 commit comments

Comments
 (0)