Merge pull request #157 from MITLibraries/TIMX-530-create-static-metadata-db-file

ghukill · web-flow · commit 3efe8b7ae64d · 2025-08-06T13:57:06.000-04:00
TIMX-530 - rebuild TIMDEXDatasetMetadata with static database file approach
diff --git a/migrations/002_2025_06_25_consistent_run_timestamp_per_etl_run.py b/migrations/002_2025_06_25_consistent_run_timestamp_per_etl_run.py
@@ -1,4 +1,7 @@
-# ruff: noqa: BLE001, D212, TRY300, TRY400
+# ruff: noqa: PGH004
+# ruff: noqa
+# type: ignore
+
 """
 Date: 2025-06-25
 
@@ -29,6 +32,10 @@
 pipenv run python migrations/002_2025_06_25_consistent_run_timestamp_per_etl_run.py \
 <DATASET_LOCATION> \
 --dry-run
+
+Update: 2025-08-04
+
+This migration is no longer functional given changes to TIMDEXDataset.
 """
 
 import argparse
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -137,7 +137,7 @@ def dataset_with_runs_location(tmp_path) -> str:
 
 
 @pytest.fixture
-def local_dataset_with_runs(dataset_with_runs_location) -> TIMDEXDataset:
+def dataset_with_runs(dataset_with_runs_location) -> TIMDEXDataset:
     return TIMDEXDataset(dataset_with_runs_location)
 
 
@@ -195,19 +195,26 @@ def dataset_with_same_day_runs(tmp_path) -> TIMDEXDataset:
     return timdex_dataset
 
 
-@pytest.fixture
-def timdex_dataset_metadata(dataset_with_same_day_runs):
-    return TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
-
-
 @pytest.fixture
 def timdex_bucket():
     return "timdex"
 
 
 @pytest.fixture
-def mock_s3_resource(timdex_bucket):
+def mocked_timdex_bucket(timdex_bucket):
     with moto.mock_aws():
         conn = boto3.resource("s3", region_name="us-east-1")
         conn.create_bucket(Bucket=timdex_bucket)
         yield conn
+
+
+@pytest.fixture
+def timdex_dataset_metadata_empty(dataset_with_runs_location):
+    return TIMDEXDatasetMetadata(dataset_with_runs_location)
+
+
+@pytest.fixture
+def timdex_dataset_metadata(dataset_with_runs_location):
+    tdm = TIMDEXDatasetMetadata(dataset_with_runs_location)
+    tdm.recreate_static_database_file()
+    return tdm
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -137,6 +137,7 @@ def test_dataset_load_with_multi_nonpartition_filters_success(fixed_local_datase
     assert fixed_local_dataset.row_count == 1
 
 
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_load_current_records_all_sources_success(dataset_with_runs_location):
     timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
 
@@ -149,6 +150,7 @@ def test_dataset_load_current_records_all_sources_success(dataset_with_runs_loca
     assert len(timdex_dataset.dataset.files) == 12
 
 
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_load_current_records_one_source_success(dataset_with_runs_location):
     timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
     timdex_dataset.load(current_records=True, source="alma")
@@ -346,9 +348,9 @@ def test_dataset_local_dataset_row_count_missing_dataset_raise_error(local_datas
         _ = td.row_count
 
 
-def test_dataset_all_records_not_current_and_not_deduped(local_dataset_with_runs):
-    local_dataset_with_runs.load()
-    all_records_df = local_dataset_with_runs.read_dataframe()
+def test_dataset_all_records_not_current_and_not_deduped(dataset_with_runs):
+    dataset_with_runs.load()
+    all_records_df = dataset_with_runs.read_dataframe()
 
     # assert counts reflect all records from dataset, no deduping
     assert all_records_df.source.value_counts().to_dict() == {"alma": 254, "dspace": 194}
@@ -358,9 +360,10 @@ def test_dataset_all_records_not_current_and_not_deduped(local_dataset_with_runs
     assert all_records_df.run_date.max() == date(2025, 2, 5)
 
 
-def test_dataset_all_current_records_deduped(local_dataset_with_runs):
-    local_dataset_with_runs.load(current_records=True)
-    all_records_df = local_dataset_with_runs.read_dataframe()
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
+def test_dataset_all_current_records_deduped(dataset_with_runs):
+    dataset_with_runs.load(current_records=True)
+    all_records_df = dataset_with_runs.read_dataframe()
 
     # assert both sources have accurate record counts for current records only
     assert all_records_df.source.value_counts().to_dict() == {"dspace": 90, "alma": 100}
@@ -373,9 +376,10 @@ def test_dataset_all_current_records_deduped(local_dataset_with_runs):
     assert all_records_df.run_date.max() == date(2025, 2, 5)  # dspace
 
 
-def test_dataset_source_current_records_deduped(local_dataset_with_runs):
-    local_dataset_with_runs.load(current_records=True, source="alma")
-    alma_records_df = local_dataset_with_runs.read_dataframe()
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
+def test_dataset_source_current_records_deduped(dataset_with_runs):
+    dataset_with_runs.load(current_records=True, source="alma")
+    alma_records_df = dataset_with_runs.read_dataframe()
 
     # assert only alma records present and correct count
     assert alma_records_df.source.value_counts().to_dict() == {"alma": 100}
@@ -388,36 +392,40 @@ def test_dataset_source_current_records_deduped(local_dataset_with_runs):
     assert alma_records_df.run_date.max() == date(2025, 1, 5)
 
 
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_all_read_methods_get_deduplication(
-    local_dataset_with_runs,
+    dataset_with_runs,
 ):
-    local_dataset_with_runs.load(current_records=True, source="alma")
+    dataset_with_runs.load(current_records=True, source="alma")
 
-    full_df = local_dataset_with_runs.read_dataframe()
-    all_records = list(local_dataset_with_runs.read_dicts_iter())
-    transformed_records = list(local_dataset_with_runs.read_transformed_records_iter())
+    full_df = dataset_with_runs.read_dataframe()
+    all_records = list(dataset_with_runs.read_dicts_iter())
+    transformed_records = list(dataset_with_runs.read_transformed_records_iter())
 
     assert len(full_df) == len(all_records) == len(transformed_records)
 
 
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_current_records_no_additional_filtering_accurate_records_yielded(
-    local_dataset_with_runs,
+    dataset_with_runs,
 ):
-    local_dataset_with_runs.load(current_records=True, source="alma")
-    df = local_dataset_with_runs.read_dataframe()
+    dataset_with_runs.load(current_records=True, source="alma")
+    df = dataset_with_runs.read_dataframe()
     assert df.action.value_counts().to_dict() == {"index": 99, "delete": 1}
 
 
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_current_records_action_filtering_accurate_records_yielded(
-    local_dataset_with_runs,
+    dataset_with_runs,
 ):
-    local_dataset_with_runs.load(current_records=True, source="alma")
-    df = local_dataset_with_runs.read_dataframe(action="index")
+    dataset_with_runs.load(current_records=True, source="alma")
+    df = dataset_with_runs.read_dataframe(action="index")
     assert df.action.value_counts().to_dict() == {"index": 99}
 
 
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_current_records_index_filtering_accurate_records_yielded(
-    local_dataset_with_runs,
+    dataset_with_runs,
 ):
     """This is a somewhat complex test, but demonstrates that only 'current' records
     are yielded when .load(current_records=True) is applied.
@@ -437,14 +445,14 @@ def test_dataset_current_records_index_filtering_accurate_records_yielded(
     "influenced" what records we would see as we continue backwards in time.
     """
     # with current_records=False, we get all 25 records from run-5
-    local_dataset_with_runs.load(current_records=False, source="alma")
-    df = local_dataset_with_runs.read_dataframe(run_id="run-5")
+    dataset_with_runs.load(current_records=False, source="alma")
+    df = dataset_with_runs.read_dataframe(run_id="run-5")
     assert len(df) == 25
 
     # with current_records=True, we only get 15 records from run-5
     # because newer run-6 influenced what records are current for older run-5
-    local_dataset_with_runs.load(current_records=True, source="alma")
-    df = local_dataset_with_runs.read_dataframe(run_id="run-5")
+    dataset_with_runs.load(current_records=True, source="alma")
+    df = dataset_with_runs.read_dataframe(run_id="run-5")
     assert len(df) == 15
     assert list(df.timdex_record_id) == [
         "alma:10",
@@ -465,6 +473,7 @@ def test_dataset_current_records_index_filtering_accurate_records_yielded(
     ]
 
 
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_load_current_records_gets_correct_same_day_full_run(
     dataset_with_same_day_runs,
 ):
@@ -477,6 +486,7 @@ def test_dataset_load_current_records_gets_correct_same_day_full_run(
     assert list(df.run_id.unique()) == ["run-2"]
 
 
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_load_current_records_gets_correct_same_day_daily_runs_ordering(
     dataset_with_same_day_runs,
 ):
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
@@ -1,89 +1,46 @@
-# ruff: noqa: PLR2004
+from duckdb import DuckDBPyConnection
 
-import duckdb
+from timdex_dataset_api import TIMDEXDatasetMetadata
 
-from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata
 
+def test_tdm_init_no_metadata_file_warning_success(caplog, dataset_with_runs_location):
+    tdm = TIMDEXDatasetMetadata(dataset_with_runs_location)
 
-def test_tdm_init_from_timdex_dataset_instance_success(dataset_with_same_day_runs):
-    tdm = TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
-    assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
+    assert tdm.conn is None
+    assert "Static metadata database not found" in caplog.text
 
 
-def test_tdm_init_from_timdex_dataset_path_success(dataset_with_runs_location):
-    tdm = TIMDEXDatasetMetadata.from_dataset_location(dataset_with_runs_location)
-    assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
+def test_tdm_local_dataset_structure_properties():
+    local_root = "/path/to/nothing"
+    tdm_local = TIMDEXDatasetMetadata(local_root)
+    assert tdm_local.location == local_root
+    assert tdm_local.location_scheme == "file"
 
 
-def test_tdm_default_database_location_in_memory(timdex_dataset_metadata):
-    assert timdex_dataset_metadata.db_path == ":memory:"
-    result = timdex_dataset_metadata.conn.query("PRAGMA database_list;").fetchone()
-    assert result[1] == "memory"  # name of database
-    assert result[2] is None  # file associated with database, where None is memory
+def test_tdm_s3_dataset_structure_properties(mocked_timdex_bucket):
+    s3_root = "s3://timdex/dataset"
+    tdm_s3 = TIMDEXDatasetMetadata(s3_root)
+    assert tdm_s3.location == s3_root
+    assert tdm_s3.location_scheme == "s3"
 
 
-def test_tdm_explicit_database_in_file(tmp_path, dataset_with_runs_location):
-    db_path = str(tmp_path / "tda.duckdb")
-    tdm = TIMDEXDatasetMetadata.from_dataset_location(
-        dataset_with_runs_location,
-        db_path=db_path,
-    )
-    assert tdm.db_path == db_path
-    result = tdm.conn.query("PRAGMA database_list;").fetchone()
-    assert result[1] == "tda"  # name of database
-    assert result[2] == db_path  # filepath passed during init
+def test_tdm_create_metadata_database_file_success(caplog, timdex_dataset_metadata_empty):
+    caplog.set_level("DEBUG")
+    timdex_dataset_metadata_empty.recreate_static_database_file()
 
 
-def test_tdm_get_duckdb_connection(timdex_dataset_metadata):
-    conn = timdex_dataset_metadata.get_connection()
-    assert isinstance(conn, duckdb.DuckDBPyConnection)
+def test_tdm_init_metadata_file_found_success(timdex_dataset_metadata):
+    assert isinstance(timdex_dataset_metadata.conn, DuckDBPyConnection)
 
 
-def test_tdm_set_threads(timdex_dataset_metadata):
-    # set to 64
-    timdex_dataset_metadata.set_database_thread_usage(64)
-    sixty_four_thread_count = timdex_dataset_metadata.conn.query(
-        """SELECT current_setting('threads');"""
-    ).fetchone()[0]
-    assert sixty_four_thread_count == 64
+def test_tdm_connection_has_static_database_attached(timdex_dataset_metadata):
+    assert set(
+        timdex_dataset_metadata.conn.query("""show databases;""").to_df().database_name
+    ) == {"memory", "static_db"}
 
-    # set to 12
-    timdex_dataset_metadata.set_database_thread_usage(12)
-    sixty_four_thread_count = timdex_dataset_metadata.conn.query(
-        """SELECT current_setting('threads');"""
-    ).fetchone()[0]
-    assert sixty_four_thread_count == 12
 
-
-def test_tdm_init_sets_up_database(timdex_dataset_metadata):
-    df = timdex_dataset_metadata.conn.query("show tables;").to_df()
-    assert set(df.name) == {"current_records", "records"}
-
-
-def test_tdm_get_current_parquet_files(timdex_dataset_metadata):
-    parquet_files = timdex_dataset_metadata.get_current_parquet_files()
-    # assert 5 total parquet files in dataset
-    # but only 3 contain current records
-    assert len(timdex_dataset_metadata.timdex_dataset.dataset.files) == 5
-    assert len(parquet_files) == 3
-
-
-def test_tdm_get_record_to_run_mapping(timdex_dataset_metadata):
-    record_map = timdex_dataset_metadata.get_current_record_to_run_map()
-
-    assert len(record_map) == 75
-    assert record_map["alma:0"] == "run-5"
-    assert record_map["alma:5"] == "run-4"
-    assert record_map["alma:19"] == "run-4"
-    assert "run-3" not in record_map.values()
-    assert record_map["alma:20"] == "run-2"
-
-
-def test_tdm_current_records_subset_of_all_records(timdex_dataset_metadata):
-    records_df = timdex_dataset_metadata.conn.query("select * from records;").to_df()
-    current_records_df = timdex_dataset_metadata.conn.query(
-        "select * from current_records;"
+def test_tdm_connection_static_database_records_table_exists(timdex_dataset_metadata):
+    records_df = timdex_dataset_metadata.conn.query(
+        """select * from static_db.records;"""
     ).to_df()
-    assert set(current_records_df.timdex_record_id).issubset(
-        set(records_df.timdex_record_id)
-    )
+    assert len(records_df) > 0
diff --git a/tests/test_s3client.py b/tests/test_s3client.py
@@ -42,7 +42,7 @@ def test_split_s3_uri_invalid():
         client._split_s3_uri("timdex/path/to/file.txt")
 
 
-def test_upload_download_file(mock_s3_resource, tmp_path):
+def test_upload_download_file(mocked_timdex_bucket, tmp_path):
     """Test upload_file and download_file methods."""
     client = S3Client()
 
@@ -62,7 +62,7 @@ def test_upload_download_file(mock_s3_resource, tmp_path):
     assert download_path.read_text() == "test content"
 
 
-def test_delete_file(mock_s3_resource, tmp_path):
+def test_delete_file(mocked_timdex_bucket, tmp_path):
     """Test delete_file method."""
     client = S3Client()
 
@@ -76,12 +76,12 @@ def test_delete_file(mock_s3_resource, tmp_path):
     client.delete_file(s3_uri)
 
     # Verify the file is deleted
-    bucket = mock_s3_resource.Bucket("timdex")
+    bucket = mocked_timdex_bucket.Bucket("timdex")
     objects = list(bucket.objects.all())
     assert len(objects) == 0
 
 
-def test_delete_folder(mock_s3_resource, tmp_path):
+def test_delete_folder(mocked_timdex_bucket, tmp_path):
     """Test delete_folder method."""
     client = S3Client()
 
@@ -104,7 +104,7 @@ def test_delete_folder(mock_s3_resource, tmp_path):
     assert len(deleted_keys) == 3
     assert all(key.startswith("folder/") for key in deleted_keys)
 
-    bucket = mock_s3_resource.Bucket("timdex")
+    bucket = mocked_timdex_bucket.Bucket("timdex")
     objects = list(bucket.objects.all())
     assert len(objects) == 1
     assert objects[0].key == "other.txt"
diff --git a/tests/test_write.py b/tests/test_write.py
@@ -52,6 +52,9 @@ def test_dataset_write_record_batches_uses_batch_size(
     )
 
 
+@pytest.mark.skip(
+    reason="Test unneeded soon when list[str] not supported for dataset location."
+)
 def test_dataset_write_to_multiple_locations_raise_error(sample_records_iter):
     timdex_dataset = TIMDEXDataset(
         location=["/path/to/records-1.parquet", "/path/to/records-2.parquet"]
diff --git a/timdex_dataset_api/dataset.py b/timdex_dataset_api/dataset.py
diff --git a/timdex_dataset_api/metadata.py b/timdex_dataset_api/metadata.py
diff --git a/timdex_dataset_api/utils.py b/timdex_dataset_api/utils.py

Original file line number	Diff line number	Diff line change
`@@ -52,6 +52,9 @@ def test_dataset_write_record_batches_uses_batch_size(`
`52`	`52`	`)`
`53`	`53`
`54`	`54`
	`55`	`+@pytest.mark.skip(`
	`56`	`+ reason="Test unneeded soon when list[str] not supported for dataset location."`
	`57`	`+)`
`55`	`58`	`def test_dataset_write_to_multiple_locations_raise_error(sample_records_iter):`
`56`	`59`	`timdex_dataset = TIMDEXDataset(`
`57`	`60`	`location=["/path/to/records-1.parquet", "/path/to/records-2.parquet"]`