Skip to content

Commit 262a910

Browse files
committed
First pass at reinstating all tests
Why these changes are being introduced: During the refactor to use dataset metadata for querying, we had to temporarily skip tests that tested for dataset filtering and current records limiting. With the SQL backed querying now in place, these tests can be reinstated. Note that a future commit will likely *add* a couple more tests for the new, optional 'WHERE' clause functionality. How this addresses that need: * No tests are skipped. * Dataset filtering tests still remain, but the key/value filters are just handled differently under the hood. * Tests for current records no longer use .load(current_records=True) but instead utilize the DuckDB table via table='current_records' within a read method. Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-529
1 parent 1625332 commit 262a910

5 files changed

Lines changed: 141 additions & 182 deletions

File tree

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,4 +160,6 @@ cython_debug/
160160
# VSCode
161161
.vscode
162162

163-
output/
163+
output/
164+
165+
AGENTS.md

tests/conftest.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ def timdex_dataset_multi_source(tmp_path) -> TIMDEXDataset:
109109
),
110110
write_append_deltas=False,
111111
)
112+
113+
# ensure static metadata database exists for read methods
114+
dataset.metadata.recreate_static_database_file()
115+
dataset.metadata.refresh()
116+
112117
return dataset
113118

114119

@@ -162,6 +167,10 @@ def timdex_dataset_with_runs(tmp_path, timdex_dataset_config_small) -> TIMDEXDat
162167
),
163168
write_append_deltas=False,
164169
)
170+
171+
# We intentionally DO NOT create the static metadata here since some tests
172+
# expect it to be missing initially. Use a separate fixture when metadata is required.
173+
165174
return dataset
166175

167176

@@ -210,9 +219,20 @@ def timdex_metadata(timdex_dataset_with_runs) -> TIMDEXDatasetMetadata:
210219
"""TIMDEXDatasetMetadata with static database file created."""
211220
metadata = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location)
212221
metadata.recreate_static_database_file()
222+
metadata.refresh()
213223
return metadata
214224

215225

226+
@pytest.fixture
227+
def timdex_dataset_with_runs_with_metadata(
228+
timdex_dataset_with_runs,
229+
) -> TIMDEXDataset:
230+
"""TIMDEXDataset with runs and static metadata created for read tests."""
231+
timdex_dataset_with_runs.metadata.recreate_static_database_file()
232+
timdex_dataset_with_runs.metadata.refresh()
233+
return timdex_dataset_with_runs
234+
235+
216236
@pytest.fixture
217237
def timdex_metadata_empty(timdex_dataset_with_runs) -> TIMDEXDatasetMetadata:
218238
"""TIMDEXDatasetMetadata without static database file."""

tests/test_dataset.py

Lines changed: 37 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import pyarrow as pa
1010
import pytest
11+
from duckdb import ConversionException
1112
from duckdb.duckdb import DuckDBPyConnection
1213
from pyarrow import fs
1314

@@ -144,111 +145,58 @@ def test_dataset_load_s3_sets_filesystem_and_dataset_success(
144145
assert timdex_dataset.dataset == mock_pyarrow_ds.return_value
145146

146147

147-
def test_dataset_get_filtered_dataset_with_single_nonpartition_success(
148-
timdex_dataset_multi_source,
149-
):
150-
filtered_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
151-
run_id="abc123",
152-
)
153-
filtered_local_df = filtered_timdex_dataset.to_table().to_pandas()
154-
155-
# timdex_dataset_multi_source consists of single 'run_id' value
156-
# therefore, filtered_timdex_dataset includes all records
157-
assert len(filtered_local_df) == filtered_timdex_dataset.count_rows()
158-
assert filtered_local_df["run_id"].unique() == ["abc123"]
148+
def test_filters_single_nonpartition_success(timdex_dataset_multi_source):
149+
df = timdex_dataset_multi_source.read_dataframe(run_id="abc123")
150+
assert df is not None
151+
assert set(df["run_id"].unique().tolist()) == {"abc123"}
159152

160153

161-
def test_dataset_get_filtered_dataset_with_multi_nonpartition_filters_success(
162-
timdex_dataset_multi_source,
163-
):
164-
filtered_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
154+
def test_filters_multi_nonpartition_success(timdex_dataset_multi_source):
155+
df = timdex_dataset_multi_source.read_dataframe(
165156
timdex_record_id="alma:0",
166157
source="alma",
167158
run_type="daily",
168159
run_id="abc123",
169160
action="index",
170161
)
171-
filtered_local_df = filtered_timdex_dataset.to_table().to_pandas()
172-
173-
assert len(filtered_local_df) == 1
174-
assert filtered_local_df["timdex_record_id"].iloc[0] == "alma:0"
175-
162+
assert df is not None
163+
assert len(df) == 1
164+
assert df.iloc[0]["timdex_record_id"] == "alma:0"
176165

177-
def test_dataset_get_filtered_dataset_with_or_nonpartition_filters_success(
178-
timdex_dataset_multi_source,
179-
):
180-
filtered_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
181-
timdex_record_id=["alma:0", "alma:1"]
182-
)
183-
filtered_local_df = filtered_timdex_dataset.to_table().to_pandas()
184-
assert len(filtered_local_df) == 2
185-
assert filtered_local_df["timdex_record_id"].tolist() == ["alma:0", "alma:1"]
186-
187-
188-
def test_dataset_get_filtered_dataset_with_run_date_str_successs(
189-
timdex_dataset_multi_source,
190-
):
191-
filtered_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
192-
run_date="2024-12-01"
193-
)
194-
empty_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
195-
run_date="2024-12-02"
196-
)
197166

198-
# timdex_dataset_multi_source consists of single 'run_date' value
199-
# therefore, filtered_timdex_dataset includes all records
200-
assert (
201-
filtered_timdex_dataset.count_rows()
202-
== timdex_dataset_multi_source.dataset.count_rows()
203-
)
204-
assert empty_timdex_dataset.count_rows() == 0
167+
def test_filters_or_nonpartition_success(timdex_dataset_multi_source):
168+
df = timdex_dataset_multi_source.read_dataframe(timdex_record_id=["alma:0", "alma:1"])
169+
assert df is not None
170+
assert set(df["timdex_record_id"].tolist()) == {"alma:0", "alma:1"}
205171

206172

207-
def test_dataset_get_filtered_dataset_with_run_date_obj_success(
208-
timdex_dataset_multi_source,
209-
):
210-
filtered_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
211-
run_date=date(2024, 12, 1)
212-
)
213-
empty_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
214-
run_date=date(2024, 12, 2)
215-
)
173+
def test_filters_run_date_str_success(timdex_dataset_multi_source):
174+
df = timdex_dataset_multi_source.read_dataframe(run_date="2024-12-01")
175+
assert df is not None
176+
df_empty = timdex_dataset_multi_source.read_dataframe(run_date="2024-12-02")
177+
assert df_empty is None or len(df_empty) == 0
216178

217-
# timdex_dataset_multi_source consists of single 'run_date' value
218-
# therefore, filtered_timdex_dataset includes all records
219-
assert (
220-
filtered_timdex_dataset.count_rows()
221-
== timdex_dataset_multi_source.dataset.count_rows()
222-
)
223-
assert empty_timdex_dataset.count_rows() == 0
224179

180+
def test_filters_run_date_obj_success(timdex_dataset_multi_source):
181+
df = timdex_dataset_multi_source.read_dataframe(run_date=date(2024, 12, 1))
182+
assert df is not None
183+
df_empty = timdex_dataset_multi_source.read_dataframe(run_date=date(2024, 12, 2))
184+
assert df_empty is None or len(df_empty) == 0
225185

226-
def test_dataset_get_filtered_dataset_with_ymd_success(timdex_dataset_multi_source):
227-
filtered_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(
228-
year="2024"
229-
)
230-
empty_timdex_dataset = timdex_dataset_multi_source._get_filtered_dataset(year="2025")
231186

232-
# timdex_dataset_multi_source consists of single 'run_date' value
233-
# therefore, filtered_timdex_dataset includes all records
234-
assert (
235-
filtered_timdex_dataset.count_rows()
236-
== timdex_dataset_multi_source.dataset.count_rows()
237-
)
238-
assert empty_timdex_dataset.count_rows() == 0
187+
def test_filters_ymd_success(timdex_dataset_multi_source):
188+
# metadata filters do not expose partition y/m/d; use run_date equivalents
189+
df = timdex_dataset_multi_source.read_dataframe(run_date=date(2024, 12, 1))
190+
assert df is not None
191+
df_empty = timdex_dataset_multi_source.read_dataframe(run_date=date(2025, 12, 1))
192+
assert df_empty is None or len(df_empty) == 0
239193

240194

241-
def test_dataset_get_filtered_dataset_with_run_date_invalid_raise_error(
242-
timdex_dataset_multi_source,
243-
):
195+
def test_filters_run_date_invalid_raise_error(timdex_dataset_multi_source):
244196
with pytest.raises(
245-
TypeError,
246-
match=(
247-
"Provided 'run_date' value must be a string matching format '%Y-%m-%d' "
248-
"or a datetime.date."
249-
),
197+
ConversionException, match="Conversion Error: Unimplemented type for cast"
250198
):
251-
_ = timdex_dataset_multi_source._get_filtered_dataset(run_date=999)
199+
timdex_dataset_multi_source.read_dataframe(run_date=999)
252200

253201

254202
def test_dataset_get_s3_filesystem_success(mocker):
@@ -272,8 +220,10 @@ def test_dataset_timdex_dataset_row_count_success(timdex_dataset):
272220
assert timdex_dataset.dataset.count_rows() == timdex_dataset.dataset.count_rows()
273221

274222

275-
def test_dataset_all_records_not_current_and_not_deduped(timdex_dataset_with_runs):
276-
all_records_df = timdex_dataset_with_runs.read_dataframe()
223+
def test_dataset_all_records_not_current_and_not_deduped(
224+
timdex_dataset_with_runs_with_metadata,
225+
):
226+
all_records_df = timdex_dataset_with_runs_with_metadata.read_dataframe()
277227

278228
# assert counts reflect all records from dataset, no deduping
279229
assert all_records_df.source.value_counts().to_dict() == {"alma": 254, "dspace": 194}

0 commit comments

Comments (0)