
Commit 7f7800d

Begin rebuilding of data and metadata tests
Why these changes are being introduced:

With the big changes to TIMDEXMetadataDataset comes the need to virtually rewrite the test suite for that class. The changes to TIMDEXMetadataDataset also influence tests for TIMDEXDataset, both how it is loaded and how 'current' record reading is tested.

How this addresses that need:

This begins with some basic tests around the loading, creating, and attaching of a static database file for TIMDEXMetadataDataset. Future tests will more fully exercise the final views and tables created. This commit also *temporarily* skips a number of TIMDEXDataset tests that will not pass until the ability to limit to 'current' records is reinstated with the updated TIMDEXMetadataDataset.

Side effects of this change:
* Test suite passes, but multiple tests are temporarily skipped.

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-530
1 parent 37c9275 commit 7f7800d

6 files changed

Lines changed: 107 additions & 73 deletions
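
For orientation before the per-file diffs, a minimal sketch of the TIMDEXDatasetMetadata lifecycle this commit's tests exercise, assembled from the fixtures and assertions below (the dataset path is a placeholder):

    from timdex_dataset_api import TIMDEXDatasetMetadata

    # initializing against a dataset location with no static metadata database
    # file logs "Static metadata database not found" and leaves conn as None
    tdm = TIMDEXDatasetMetadata("/path/to/dataset")
    assert tdm.conn is None

    # recreating the static database file builds it locally, copies it to the
    # dataset's metadata location, and refreshes the DuckDB connection
    tdm.recreate_static_database_file()
    assert tdm.conn is not None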


tests/conftest.py

Lines changed: 14 additions & 7 deletions
@@ -137,7 +137,7 @@ def dataset_with_runs_location(tmp_path) -> str:


 @pytest.fixture
-def local_dataset_with_runs(dataset_with_runs_location) -> TIMDEXDataset:
+def dataset_with_runs(dataset_with_runs_location) -> TIMDEXDataset:
     return TIMDEXDataset(dataset_with_runs_location)


@@ -195,19 +195,26 @@ def dataset_with_same_day_runs(tmp_path) -> TIMDEXDataset:
     return timdex_dataset


-@pytest.fixture
-def timdex_dataset_metadata(dataset_with_same_day_runs):
-    return TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
-
-
 @pytest.fixture
 def timdex_bucket():
     return "timdex"


 @pytest.fixture
-def mock_s3_resource(timdex_bucket):
+def mocked_timdex_bucket(timdex_bucket):
     with moto.mock_aws():
         conn = boto3.resource("s3", region_name="us-east-1")
         conn.create_bucket(Bucket=timdex_bucket)
         yield conn
+
+
+@pytest.fixture
+def timdex_dataset_metadata_empty(dataset_with_runs_location):
+    return TIMDEXDatasetMetadata(dataset_with_runs_location)
+
+
+@pytest.fixture
+def timdex_dataset_metadata(dataset_with_runs_location):
+    tdm = TIMDEXDatasetMetadata(dataset_with_runs_location)
+    tdm.recreate_static_database_file()
+    return tdm
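
The two fixtures intentionally capture the two states the metadata tests need: timdex_dataset_metadata_empty yields an instance before any static database file exists, while timdex_dataset_metadata builds that file first. A hypothetical test consuming both might read as follows (parameter order matters here, since both fixtures share the same dataset location and the "empty" one must initialize before the database file is built):

    def test_fixture_states(timdex_dataset_metadata_empty, timdex_dataset_metadata):
        # no static database file existed at init time, so no connection
        assert timdex_dataset_metadata_empty.conn is None

        # recreate_static_database_file() already ran, so a connection is attached
        assert timdex_dataset_metadata.conn is not None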

tests/test_dataset.py

Lines changed: 35 additions & 25 deletions
@@ -137,6 +137,7 @@ def test_dataset_load_with_multi_nonpartition_filters_success(fixed_local_datase
     assert fixed_local_dataset.row_count == 1


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_load_current_records_all_sources_success(dataset_with_runs_location):
     timdex_dataset = TIMDEXDataset(dataset_with_runs_location)

@@ -149,6 +150,7 @@ def test_dataset_load_current_records_all_sources_success(dataset_with_runs_loca
     assert len(timdex_dataset.dataset.files) == 12


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_load_current_records_one_source_success(dataset_with_runs_location):
     timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
     timdex_dataset.load(current_records=True, source="alma")
@@ -346,9 +348,9 @@ def test_dataset_local_dataset_row_count_missing_dataset_raise_error(local_datas
         _ = td.row_count


-def test_dataset_all_records_not_current_and_not_deduped(local_dataset_with_runs):
-    local_dataset_with_runs.load()
-    all_records_df = local_dataset_with_runs.read_dataframe()
+def test_dataset_all_records_not_current_and_not_deduped(dataset_with_runs):
+    dataset_with_runs.load()
+    all_records_df = dataset_with_runs.read_dataframe()

     # assert counts reflect all records from dataset, no deduping
     assert all_records_df.source.value_counts().to_dict() == {"alma": 254, "dspace": 194}
@@ -358,9 +360,10 @@ def test_dataset_all_records_not_current_and_not_deduped(local_dataset_with_runs
     assert all_records_df.run_date.max() == date(2025, 2, 5)


-def test_dataset_all_current_records_deduped(local_dataset_with_runs):
-    local_dataset_with_runs.load(current_records=True)
-    all_records_df = local_dataset_with_runs.read_dataframe()
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
+def test_dataset_all_current_records_deduped(dataset_with_runs):
+    dataset_with_runs.load(current_records=True)
+    all_records_df = dataset_with_runs.read_dataframe()

     # assert both sources have accurate record counts for current records only
     assert all_records_df.source.value_counts().to_dict() == {"dspace": 90, "alma": 100}
@@ -373,9 +376,10 @@ def test_dataset_all_current_records_deduped(local_dataset_with_runs
     assert all_records_df.run_date.max() == date(2025, 2, 5)  # dspace


-def test_dataset_source_current_records_deduped(local_dataset_with_runs):
-    local_dataset_with_runs.load(current_records=True, source="alma")
-    alma_records_df = local_dataset_with_runs.read_dataframe()
+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
+def test_dataset_source_current_records_deduped(dataset_with_runs):
+    dataset_with_runs.load(current_records=True, source="alma")
+    alma_records_df = dataset_with_runs.read_dataframe()

     # assert only alma records present and correct count
     assert alma_records_df.source.value_counts().to_dict() == {"alma": 100}
@@ -388,36 +392,40 @@ def test_dataset_source_current_records_deduped(local_dataset_with_runs
     assert alma_records_df.run_date.max() == date(2025, 1, 5)


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_all_read_methods_get_deduplication(
-    local_dataset_with_runs,
+    dataset_with_runs,
 ):
-    local_dataset_with_runs.load(current_records=True, source="alma")
+    dataset_with_runs.load(current_records=True, source="alma")

-    full_df = local_dataset_with_runs.read_dataframe()
-    all_records = list(local_dataset_with_runs.read_dicts_iter())
-    transformed_records = list(local_dataset_with_runs.read_transformed_records_iter())
+    full_df = dataset_with_runs.read_dataframe()
+    all_records = list(dataset_with_runs.read_dicts_iter())
+    transformed_records = list(dataset_with_runs.read_transformed_records_iter())

     assert len(full_df) == len(all_records) == len(transformed_records)


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_current_records_no_additional_filtering_accurate_records_yielded(
-    local_dataset_with_runs,
+    dataset_with_runs,
 ):
-    local_dataset_with_runs.load(current_records=True, source="alma")
-    df = local_dataset_with_runs.read_dataframe()
+    dataset_with_runs.load(current_records=True, source="alma")
+    df = dataset_with_runs.read_dataframe()
     assert df.action.value_counts().to_dict() == {"index": 99, "delete": 1}


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_current_records_action_filtering_accurate_records_yielded(
-    local_dataset_with_runs,
+    dataset_with_runs,
 ):
-    local_dataset_with_runs.load(current_records=True, source="alma")
-    df = local_dataset_with_runs.read_dataframe(action="index")
+    dataset_with_runs.load(current_records=True, source="alma")
+    df = dataset_with_runs.read_dataframe(action="index")
     assert df.action.value_counts().to_dict() == {"index": 99}


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_current_records_index_filtering_accurate_records_yielded(
-    local_dataset_with_runs,
+    dataset_with_runs,
 ):
     """This is a somewhat complex test, but demonstrates that only 'current' records
     are yielded when .load(current_records=True) is applied.
@@ -437,14 +445,14 @@ def test_dataset_current_records_index_filtering_accurate_records_yielded(
     "influenced" what records we would see as we continue backwards in time.
     """
     # with current_records=False, we get all 25 records from run-5
-    local_dataset_with_runs.load(current_records=False, source="alma")
-    df = local_dataset_with_runs.read_dataframe(run_id="run-5")
+    dataset_with_runs.load(current_records=False, source="alma")
+    df = dataset_with_runs.read_dataframe(run_id="run-5")
     assert len(df) == 25

     # with current_records=True, we only get 15 records from run-5
     # because newer run-6 influenced what records are current for older run-5
-    local_dataset_with_runs.load(current_records=True, source="alma")
-    df = local_dataset_with_runs.read_dataframe(run_id="run-5")
+    dataset_with_runs.load(current_records=True, source="alma")
+    df = dataset_with_runs.read_dataframe(run_id="run-5")
     assert len(df) == 15
     assert list(df.timdex_record_id) == [
         "alma:10",
@@ -465,6 +473,7 @@ def test_dataset_current_records_index_filtering_accurate_records_yielded(
     ]


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_load_current_records_gets_correct_same_day_full_run(
     dataset_with_same_day_runs,
 ):
@@ -477,6 +486,7 @@ def test_dataset_load_current_records_gets_correct_same_day_full_run(
     assert list(df.run_id.unique()) == ["run-2"]


+@pytest.mark.skip(reason="All tests for 'current' records will be reworked.")
 def test_dataset_load_current_records_gets_correct_same_day_daily_runs_ordering(
     dataset_with_same_day_runs,
 ):

tests/test_metadata.py

Lines changed: 21 additions & 27 deletions
@@ -1,42 +1,36 @@
-# ruff: noqa: PLR2004
+from duckdb import DuckDBPyConnection

-import duckdb
+from timdex_dataset_api import TIMDEXDatasetMetadata

-from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata

+def test_tdm_init_no_metadata_file_warning_success(caplog, dataset_with_runs_location):
+    tdm = TIMDEXDatasetMetadata(dataset_with_runs_location)

-def test_tdm_init_from_timdex_dataset_instance_success(dataset_with_same_day_runs):
-    tdm = TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
-    assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
+    assert tdm.conn is None
+    assert "Static metadata database not found" in caplog.text


-def test_tdm_init_from_timdex_dataset_path_success(dataset_with_runs_location):
-    tdm = TIMDEXDatasetMetadata.from_dataset_location(dataset_with_runs_location)
-    assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
+def test_tdm_local_dataset_structure_properties():
+    local_root = "/path/to/nothing"
+    tdm_local = TIMDEXDatasetMetadata(local_root)
+    assert tdm_local.location == local_root
+    assert tdm_local.location_scheme == "file"


-def test_tdm_default_database_location_in_memory(timdex_dataset_metadata):
-    assert timdex_dataset_metadata.db_path == ":memory:"
-    result = timdex_dataset_metadata.conn.query("PRAGMA database_list;").fetchone()
-    assert result[1] == "memory"  # name of database
-    assert result[2] is None  # file associated with database, where None is memory
+def test_tdm_s3_dataset_structure_properties(mocked_timdex_bucket):
+    s3_root = "s3://timdex/dataset"
+    tdm_s3 = TIMDEXDatasetMetadata(s3_root)
+    assert tdm_s3.location == s3_root
+    assert tdm_s3.location_scheme == "s3"


-def test_tdm_explicit_database_in_file(tmp_path, dataset_with_runs_location):
-    db_path = str(tmp_path / "tda.duckdb")
-    tdm = TIMDEXDatasetMetadata.from_dataset_location(
-        dataset_with_runs_location,
-        db_path=db_path,
-    )
-    assert tdm.db_path == db_path
-    result = tdm.conn.query("PRAGMA database_list;").fetchone()
-    assert result[1] == "tda"  # name of database
-    assert result[2] == db_path  # filepath passed during init
+def test_tdm_create_metadata_database_file_success(caplog, timdex_dataset_metadata_empty):
+    caplog.set_level("DEBUG")
+    timdex_dataset_metadata_empty.recreate_static_database_file()


-def test_tdm_get_duckdb_connection(timdex_dataset_metadata):
-    conn = timdex_dataset_metadata.get_connection()
-    assert isinstance(conn, duckdb.DuckDBPyConnection)
+def test_tdm_init_metadata_file_found_success(timdex_dataset_metadata):
+    assert isinstance(timdex_dataset_metadata.conn, DuckDBPyConnection)


 def test_tdm_connection_has_static_database_attached(timdex_dataset_metadata):

tests/test_s3client.py

Lines changed: 5 additions & 5 deletions
@@ -42,7 +42,7 @@ def test_split_s3_uri_invalid():
         client._split_s3_uri("timdex/path/to/file.txt")


-def test_upload_download_file(mock_s3_resource, tmp_path):
+def test_upload_download_file(mocked_timdex_bucket, tmp_path):
     """Test upload_file and download_file methods."""
     client = S3Client()

@@ -62,7 +62,7 @@ def test_upload_download_file(mock_s3_resource, tmp_path):
     assert download_path.read_text() == "test content"


-def test_delete_file(mock_s3_resource, tmp_path):
+def test_delete_file(mocked_timdex_bucket, tmp_path):
     """Test delete_file method."""
     client = S3Client()

@@ -76,12 +76,12 @@ def test_delete_file(mock_s3_resource, tmp_path):
     client.delete_file(s3_uri)

     # Verify the file is deleted
-    bucket = mock_s3_resource.Bucket("timdex")
+    bucket = mocked_timdex_bucket.Bucket("timdex")
     objects = list(bucket.objects.all())
     assert len(objects) == 0


-def test_delete_folder(mock_s3_resource, tmp_path):
+def test_delete_folder(mocked_timdex_bucket, tmp_path):
     """Test delete_folder method."""
     client = S3Client()

@@ -104,7 +104,7 @@ def test_delete_folder(mock_s3_resource, tmp_path):
     assert len(deleted_keys) == 3
     assert all(key.startswith("folder/") for key in deleted_keys)

-    bucket = mock_s3_resource.Bucket("timdex")
+    bucket = mocked_timdex_bucket.Bucket("timdex")
     objects = list(bucket.objects.all())
     assert len(objects) == 1
     assert objects[0].key == "other.txt"

tests/test_write.py

Lines changed: 3 additions & 0 deletions
@@ -52,6 +52,9 @@ def test_dataset_write_record_batches_uses_batch_size(
     )


+@pytest.mark.skip(
+    reason="Test unneeded soon when list[str] not supported for dataset location."
+)
 def test_dataset_write_to_multiple_locations_raise_error(sample_records_iter):
     timdex_dataset = TIMDEXDataset(
         location=["/path/to/records-1.parquet", "/path/to/records-2.parquet"]

timdex_dataset_api/metadata.py

Lines changed: 29 additions & 9 deletions
@@ -1,9 +1,11 @@
 """timdex_dataset_api/metadata.py"""

 import os
+import shutil
 import tempfile
 import time
 from pathlib import Path
+from typing import Literal
 from urllib.parse import urlparse

 import duckdb
@@ -41,6 +43,15 @@ def __init__(
         self.location = location
         self.conn: None | DuckDBPyConnection = self.setup_duckdb_context()

+    @property
+    def location_scheme(self) -> Literal["file", "s3"]:
+        scheme = urlparse(self.location).scheme
+        if scheme == "":
+            return "file"
+        if scheme == "s3":
+            return "s3"
+        raise ValueError(f"Location with scheme type '{scheme}' not supported.")
+
     @property
     def metadata_root(self) -> str:
         return f"{self.location.removesuffix('/')}/metadata"
@@ -59,7 +70,7 @@ def append_deltas_path(self) -> str:

     def database_exists(self) -> bool:
         """Check if static metadata database file exists."""
-        if urlparse(self.metadata_database_path).scheme == "s3":
+        if self.location_scheme == "s3":
             s3_client = S3Client()
             return s3_client.object_exists(self.metadata_database_path)
         return os.path.exists(self.metadata_database_path)
@@ -75,10 +86,11 @@ def recreate_static_database_file(self) -> None:
         5. Upload DuckDB database file to target destination, making that the new
            static metadata database file
         """
-        s3_client = S3Client()
-
-        # remove any append deltas that may exist at this time of database recreation
-        s3_client.delete_folder(self.append_deltas_path)
+        if self.location_scheme == "s3":
+            s3_client = S3Client()
+            s3_client.delete_folder(self.append_deltas_path)
+        else:
+            shutil.rmtree(self.append_deltas_path, ignore_errors=True)

         # build database locally
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -91,10 +103,18 @@
             self._create_full_dataset_table(conn)

             # copy local database file to remote location
-            s3_client.upload_file(
-                local_db_path,
-                self.metadata_database_path,
-            )
+            if self.location_scheme == "s3":
+                s3_client = S3Client()
+                s3_client.upload_file(
+                    local_db_path,
+                    self.metadata_database_path,
+                )
+            else:
+                Path(self.metadata_database_path).parent.mkdir(
+                    parents=True,
+                    exist_ok=True,
+                )
+                shutil.copy(local_db_path, self.metadata_database_path)

             # refresh DuckDB connection
             self.conn = self.setup_duckdb_context()
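
As a quick illustration, the new location_scheme property resolves as sketched here; the paths are hypothetical, and as in the tests above an S3 location assumes a reachable (or moto-mocked) bucket:

    # as in test_tdm_local_dataset_structure_properties above
    tdm_local = TIMDEXDatasetMetadata("/data/timdex-dataset")
    assert tdm_local.location_scheme == "file"  # empty URI scheme -> local filesystem

    # as in test_tdm_s3_dataset_structure_properties (bucket mocked via moto there)
    tdm_s3 = TIMDEXDatasetMetadata("s3://timdex/dataset")
    assert tdm_s3.location_scheme == "s3"

    # any other scheme (e.g. "ftp") raises ValueError when the property is read

This lets database_exists() and recreate_static_database_file() branch between S3Client calls and plain filesystem operations without re-parsing the location each time.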
