Skip to content

Commit 262a910

Browse files
committed
First pass at reinstating all tests
Why these changes are being introduced: During the refactor to use dataset metadata for querying, we had to temporarily skip tests that tested for dataset filtering and current records limiting. With the SQL backed querying now in place, these tests can be reinstated. Note that a future commit will likely *add* a couple more tests for the new, optional 'WHERE' clause functionality. How this addresses that need: * No tests are skipped. * Dataset filtering tests still remain, but the key/value filters are just handled differently under the hood. * Tests for current records no longer use .load(current_records=True) but instead utilize the DuckDB table via table='current_records' within a read method. Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-529
1 parent 1625332 commit 262a910

5 files changed

Lines changed: 141 additions & 182 deletions

File tree

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,4 +160,6 @@ cython_debug/
160160
# VSCode
161161
.vscode
162162

163-
output/
163+
output/
164+
165+
AGENTS.md

tests/conftest.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ def timdex_dataset_multi_source(tmp_path) -> TIMDEXDataset:
109109
),
110110
write_append_deltas=False,
111111
)
112+
113+
# ensure static metadata database exists for read methods
114+
dataset.metadata.recreate_static_database_file()
115+
dataset.metadata.refresh()
116+
112117
return dataset
113118

114119

@@ -162,6 +167,10 @@ def timdex_dataset_with_runs(tmp_path, timdex_dataset_config_small) -> TIMDEXDat
162167
),
163168
write_append_deltas=False,
164169
)
170+
171+
# We intentionally DO NOT create the static metadata here since some tests
172+
# expect it to be missing initially. Use a separate fixture when metadata is required.
173+
165174
return dataset
166175

167176

@@ -210,9 +219,20 @@ def timdex_metadata(timdex_dataset_with_runs) -> TIMDEXDatasetMetadata:
210219
"""TIMDEXDatasetMetadata with static database file created."""
211220
metadata = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location)
212221
metadata.recreate_static_database_file()
222+
metadata.refresh()
213223
return metadata
214224

215225

226+
@pytest.fixture
227+
def timdex_dataset_with_runs_with_metadata(
228+
timdex_dataset_with_runs,
229+
) -> TIMDEXDataset:
230+
"""TIMDEXDataset with runs and static metadata created for read tests."""
231+
timdex_dataset_with_runs.metadata.recreate_static_database_file()
232+
timdex_dataset_with_runs.metadata.refresh()
233+
return timdex_dataset_with_runs
234+
235+
216236
@pytest.fixture
217237
def timdex_metadata_empty(timdex_dataset_with_runs) -> TIMDEXDatasetMetadata:
218238
"""TIMDEXDatasetMetadata without static database file."""

tests/test_dataset.py

Lines changed: 37 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import pyarrow as pa
1010
import pytest
11+
from duckdb import ConversionException
1112
from duckdb.duckdb import DuckDBPyConnection
1213
from pyarrow import fs
1314

@@ -144,111 +145,58 @@ def test_dataset_load_s3_sets_filesystem_and_dataset_success(
144145
assert timdex_dataset.dataset == mock_pyarrow_ds.return_value
145146

146147

147-
def test_dataset_get_filtered_dataset_with_single_nonpartition_success(
148-
timdex_dataset_multi_source,
149-
):
150-
filtered_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
151-
run_id="abc123",
152-
)
153-
filtered_local_df = filtered_timdex_dataset.to_table().to_pandas()
154-
155-
# timdex_dataset_multi_source consists of single 'run_id' value
156-
# therefore, filtered_timdex_dataset includes all records
157-
assert len(filtered_local_df) == filtered_timdex_dataset.count_rows()
158-
assert filtered_local_df["run_id"].unique() == ["abc123"]
148+
def test_filters_single_nonpartition_success(timdex_dataset_multi_source):
149+
df = timdex_dataset_multi_source.read_dataframe(run_id="abc123")
150+
assert df is not None
151+
assert set(df["run_id"].unique().tolist()) == {"abc123"}
159152

160153

161-
def test_dataset_get_filtered_dataset_with_multi_nonpartition_filters_success(
162-
timdex_dataset_multi_source,
163-
):
164-
filtered_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
154+
def test_filters_multi_nonpartition_success(timdex_dataset_multi_source):
155+
df = timdex_dataset_multi_source.read_dataframe(
165156
timdex_record_id="alma:0",
166157
source="alma",
167158
run_type="daily",
168159
run_id="abc123",
169160
action="index",
170161
)
171-
filtered_local_df = filtered_timdex_dataset.to_table().to_pandas()
172-
173-
assert len(filtered_local_df) == 1
174-
assert filtered_local_df["timdex_record_id"].iloc[0] == "alma:0"
175-
162+
assert df is not None
163+
assert len(df) == 1
164+
assert df.iloc[0]["timdex_record_id"] == "alma:0"
176165

177-
def test_dataset_get_filtered_dataset_with_or_nonpartition_filters_success(
178-
timdex_dataset_multi_source,
179-
):
180-
filtered_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
181-
timdex_record_id=["alma:0", "alma:1"]
182-
)
183-
filtered_local_df = filtered_timdex_dataset.to_table().to_pandas()
184-
assert len(filtered_local_df) == 2
185-
assert filtered_local_df["timdex_record_id"].tolist() == ["alma:0", "alma:1"]
186-
187-
188-
def test_dataset_get_filtered_dataset_with_run_date_str_successs(
189-
timdex_dataset_multi_source,
190-
):
191-
filtered_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
192-
run_date="2024-12-01"
193-
)
194-
empty_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
195-
run_date="2024-12-02"
196-
)
197166

198-
# timdex_dataset_multi_source consists of single 'run_date' value
199-
# therefore, filtered_timdex_dataset includes all records
200-
assert (
201-
filtered_timdex_dataset.count_rows()
202-
== timdex_dataset_multi_source.dataset.count_rows()
203-
)
204-
assert empty_timdex_dataset.count_rows() == 0
167+
def test_filters_or_nonpartition_success(timdex_dataset_multi_source):
168+
df = timdex_dataset_multi_source.read_dataframe(timdex_record_id=["alma:0", "alma:1"])
169+
assert df is not None
170+
assert set(df["timdex_record_id"].tolist()) == {"alma:0", "alma:1"}
205171

206172

207-
def test_dataset_get_filtered_dataset_with_run_date_obj_success(
208-
timdex_dataset_multi_source,
209-
):
210-
filtered_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
211-
run_date=date(2024, 12, 1)
212-
)
213-
empty_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
214-
run_date=date(2024, 12, 2)
215-
)
173+
def test_filters_run_date_str_success(timdex_dataset_multi_source):
174+
df = timdex_dataset_multi_source.read_dataframe(run_date="2024-12-01")
175+
assert df is not None
176+
df_empty = timdex_dataset_multi_source.read_dataframe(run_date="2024-12-02")
177+
assert df_empty is None or len(df_empty) == 0
216178

217-
# timdex_dataset_multi_source consists of single 'run_date' value
218-
# therefore, filtered_timdex_dataset includes all records
219-
assert (
220-
filtered_timdex_dataset.count_rows()
221-
== timdex_dataset_multi_source.dataset.count_rows()
222-
)
223-
assert empty_timdex_dataset.count_rows() == 0
224179

180+
def test_filters_run_date_obj_success(timdex_dataset_multi_source):
181+
df = timdex_dataset_multi_source.read_dataframe(run_date=date(2024, 12, 1))
182+
assert df is not None
183+
df_empty = timdex_dataset_multi_source.read_dataframe(run_date=date(2024, 12, 2))
184+
assert df_empty is None or len(df_empty) == 0
225185

226-
def test_dataset_get_filtered_dataset_with_ymd_success(timdex_dataset_multi_source):
227-
filtered_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
228-
year="2024"
229-
)
230-
empty_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(year="2025")
231186

232-
# timdex_dataset_multi_source consists of single 'run_date' value
233-
# therefore, filtered_timdex_dataset includes all records
234-
assert (
235-
filtered_timdex_dataset.count_rows()
236-
== timdex_dataset_multi_source.dataset.count_rows()
237-
)
238-
assert empty_timdex_dataset.count_rows() == 0
187+
def test_filters_ymd_success(timdex_dataset_multi_source):
188+
# metadata filters do not expose partition y/m/d; use run_date equivalents
189+
df = timdex_dataset_multi_source.read_dataframe(run_date=date(2024, 12, 1))
190+
assert df is not None
191+
df_empty = timdex_dataset_multi_source.read_dataframe(run_date=date(2025, 12, 1))
192+
assert df_empty is None or len(df_empty) == 0
239193

240194

241-
def test_dataset_get_filtered_dataset_with_run_date_invalid_raise_error(
242-
timdex_dataset_multi_source,
243-
):
195+
def test_filters_run_date_invalid_raise_error(timdex_dataset_multi_source):
244196
with pytest.raises(
245-
TypeError,
246-
match=(
247-
"Provided 'run_date' value must be a string matching format '%Y-%m-%d' "
248-
"or a datetime.date."
249-
),
197+
ConversionException, match="Conversion Error: Unimplemented type for cast"
250198
):
251-
_ = timdex_dataset_multi_source._get_filtered_dataset(run_date=999)
199+
timdex_dataset_multi_source.read_dataframe(run_date=999)
252200

253201

254202
def test_dataset_get_s3_filesystem_success(mocker):
@@ -272,8 +220,10 @@ def test_dataset_timdex_dataset_row_count_success(timdex_dataset):
272220
assert timdex_dataset.dataset.count_rows() == timdex_dataset.dataset.count_rows()
273221

274222

275-
def test_dataset_all_records_not_current_and_not_deduped(timdex_dataset_with_runs):
276-
all_records_df = timdex_dataset_with_runs.read_dataframe()
223+
def test_dataset_all_records_not_current_and_not_deduped(
224+
timdex_dataset_with_runs_with_metadata,
225+
):
226+
all_records_df = timdex_dataset_with_runs_with_metadata.read_dataframe()
277227

278228
# assert counts reflect all records from dataset, no deduping
279229
assert all_records_df.source.value_counts().to_dict() == {"alma": 254, "dspace": 194}

0 commit comments

Comments (0)