Create TIMDEXDatasetMetadata class

ghukill · ghukill · commit 3f90fc69ac4e · 2025-06-24T14:39:38.000-04:00
Why these changes are being introduced: As outlined in more detail in TIMX-506, this class is a new, alternate approach to providing high-level metadata about the dataset to better support bulk operations. This class is designed to remove the need for TIMDEXRunManager, while also providing more helpful and granular data about the dataset. Ultimately this will reduce code complexity by relying less on implicit data contracts about the dataset and more on verifiable, in-hand data about the dataset that can be used for bulk operations (e.g. yielding current records). How this addresses that need: * New class TIMDEXDatasetMetadata is created * Uses DuckDB to quickly crawl the dataset, generating an in-memory or on-disk database of record metadata * This class is not yet wired into any TIMDEXDataset operations; this will happen in future work Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-506
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -10,7 +10,7 @@
     generate_sample_records,
     generate_sample_records_with_simulated_partitions,
 )
-from timdex_dataset_api import TIMDEXDataset
+from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata
 from timdex_dataset_api.dataset import TIMDEXDatasetConfig
 
 
@@ -208,3 +208,8 @@ def dataset_with_same_day_runs(tmp_path) -> TIMDEXDataset:
     timdex_dataset.load()
 
     return timdex_dataset
+
+
+@pytest.fixture
+def timdex_dataset_metadata(dataset_with_same_day_runs):
+    return TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
@@ -0,0 +1,89 @@
+# ruff: noqa: PLR2004
+
+import duckdb
+
+from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata
+
+
+def test_tdm_init_from_timdex_dataset_instance_success(dataset_with_same_day_runs):
+    tdm = TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
+    assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
+
+
+def test_tdm_init_from_timdex_dataset_path_success(dataset_with_runs_location):
+    tdm = TIMDEXDatasetMetadata.from_dataset_location(dataset_with_runs_location)
+    assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
+
+
+def test_tdm_default_database_location_in_memory(timdex_dataset_metadata):
+    assert timdex_dataset_metadata.db_path == ":memory:"
+    result = timdex_dataset_metadata.conn.query("PRAGMA database_list;").fetchone()
+    assert result[1] == "memory"  # name of database
+    assert result[2] is None  # file associated with database, where None is memory
+
+
+def test_tdm_explicit_database_in_file(tmp_path, dataset_with_runs_location):
+    db_path = str(tmp_path / "tda.duckdb")
+    tdm = TIMDEXDatasetMetadata.from_dataset_location(
+        dataset_with_runs_location,
+        db_path=db_path,
+    )
+    assert tdm.db_path == db_path
+    result = tdm.conn.query("PRAGMA database_list;").fetchone()
+    assert result[1] == "tda"  # name of database
+    assert result[2] == db_path  # filepath passed during init
+
+
+def test_tdm_get_duckdb_connection(timdex_dataset_metadata):
+    conn = timdex_dataset_metadata.get_connection()
+    assert isinstance(conn, duckdb.DuckDBPyConnection)
+
+
+def test_tdm_set_threads(timdex_dataset_metadata):
+    # set thread usage to 64 and confirm the connection reports that value
+    timdex_dataset_metadata.set_database_thread_usage(64)
+    sixty_four_thread_count = timdex_dataset_metadata.conn.query(
+        """SELECT current_setting('threads');"""
+    ).fetchone()[0]
+    assert sixty_four_thread_count == 64
+
+    # lower thread usage to 12 and confirm the setting is updated, not sticky
+    timdex_dataset_metadata.set_database_thread_usage(12)
+    twelve_thread_count = timdex_dataset_metadata.conn.query(
+        """SELECT current_setting('threads');"""
+    ).fetchone()[0]
+    assert twelve_thread_count == 12
+
+
+def test_tdm_init_sets_up_database(timdex_dataset_metadata):
+    df = timdex_dataset_metadata.conn.query("show tables;").to_df()
+    assert set(df.name) == {"current_records", "records"}
+
+
+def test_tdm_get_current_parquet_files(timdex_dataset_metadata):
+    parquet_files = timdex_dataset_metadata.get_current_parquet_files()
+    # assert 5 total parquet files in dataset
+    # but only 3 contain current records
+    assert len(timdex_dataset_metadata.timdex_dataset.dataset.files) == 5
+    assert len(parquet_files) == 3
+
+
+def test_tdm_get_record_to_run_mapping(timdex_dataset_metadata):
+    record_map = timdex_dataset_metadata.get_current_record_to_run_map()
+
+    assert len(record_map) == 75
+    assert record_map["alma:0"] == "run-5"
+    assert record_map["alma:5"] == "run-4"
+    assert record_map["alma:19"] == "run-4"
+    assert "run-3" not in record_map.values()
+    assert record_map["alma:20"] == "run-2"
+
+
+def test_tdm_current_records_subset_of_all_records(timdex_dataset_metadata):
+    records_df = timdex_dataset_metadata.conn.query("select * from records;").to_df()
+    current_records_df = timdex_dataset_metadata.conn.query(
+        "select * from current_records;"
+    ).to_df()
+    assert set(current_records_df.timdex_record_id).issubset(
+        set(records_df.timdex_record_id)
+    )
diff --git a/timdex_dataset_api/__init__.py b/timdex_dataset_api/__init__.py
@@ -1,11 +1,13 @@
 """timdex_dataset_api/__init__.py"""
 
 from timdex_dataset_api.dataset import TIMDEXDataset
+from timdex_dataset_api.metadata import TIMDEXDatasetMetadata
 from timdex_dataset_api.record import DatasetRecord
 
 __version__ = "2.1.0"
 
 __all__ = [
     "DatasetRecord",
     "TIMDEXDataset",
+    "TIMDEXDatasetMetadata",
 ]
diff --git a/timdex_dataset_api/metadata.py b/timdex_dataset_api/metadata.py