88
99import pyarrow as pa
1010import pytest
11+ from duckdb import ConversionException
1112from duckdb .duckdb import DuckDBPyConnection
1213from pyarrow import fs
1314
@@ -144,111 +145,58 @@ def test_dataset_load_s3_sets_filesystem_and_dataset_success(
144145 assert timdex_dataset .dataset == mock_pyarrow_ds .return_value
145146
146147
147- def test_dataset_get_filtered_dataset_with_single_nonpartition_success (
148- timdex_dataset_multi_source ,
149- ):
150- filtered_timdex_dataset = timdex_dataset_multi_source ._get_filtered_dataset (
151- run_id = "abc123" ,
152- )
153- filtered_local_df = filtered_timdex_dataset .to_table ().to_pandas ()
154-
155- # timdex_dataset_multi_source consists of single 'run_id' value
156- # therefore, filtered_timdex_dataset includes all records
157- assert len (filtered_local_df ) == filtered_timdex_dataset .count_rows ()
158- assert filtered_local_df ["run_id" ].unique () == ["abc123" ]
def test_filters_single_nonpartition_success(timdex_dataset_multi_source):
    """Reading with a single run_id filter returns rows for that run only."""
    records = timdex_dataset_multi_source.read_dataframe(run_id="abc123")
    assert records is not None

    # every returned row must carry the requested run_id
    observed_run_ids = set(records["run_id"].unique().tolist())
    assert observed_run_ids == {"abc123"}
159152
160153
161- def test_dataset_get_filtered_dataset_with_multi_nonpartition_filters_success (
162- timdex_dataset_multi_source ,
163- ):
164- filtered_timdex_dataset = timdex_dataset_multi_source ._get_filtered_dataset (
def test_filters_multi_nonpartition_success(timdex_dataset_multi_source):
    """Passing several keyword filters together yields exactly one record."""
    filters = {
        "timdex_record_id": "alma:0",
        "source": "alma",
        "run_type": "daily",
        "run_id": "abc123",
        "action": "index",
    }
    records = timdex_dataset_multi_source.read_dataframe(**filters)

    assert records is not None
    assert len(records) == 1

    # the single surviving row is the record that was asked for
    first_row = records.iloc[0]
    assert first_row["timdex_record_id"] == "alma:0"
176165
177- def test_dataset_get_filtered_dataset_with_or_nonpartition_filters_success (
178- timdex_dataset_multi_source ,
179- ):
180- filtered_timdex_dataset = timdex_dataset_multi_source ._get_filtered_dataset (
181- timdex_record_id = ["alma:0" , "alma:1" ]
182- )
183- filtered_local_df = filtered_timdex_dataset .to_table ().to_pandas ()
184- assert len (filtered_local_df ) == 2
185- assert filtered_local_df ["timdex_record_id" ].tolist () == ["alma:0" , "alma:1" ]
186-
187-
188- def test_dataset_get_filtered_dataset_with_run_date_str_successs (
189- timdex_dataset_multi_source ,
190- ):
191- filtered_timdex_dataset = timdex_dataset_multi_source ._get_filtered_dataset (
192- run_date = "2024-12-01"
193- )
194- empty_timdex_dataset = timdex_dataset_multi_source ._get_filtered_dataset (
195- run_date = "2024-12-02"
196- )
197166
198- # timdex_dataset_multi_source consists of single 'run_date' value
199- # therefore, filtered_timdex_dataset includes all records
200- assert (
201- filtered_timdex_dataset .count_rows ()
202- == timdex_dataset_multi_source .dataset .count_rows ()
203- )
204- assert empty_timdex_dataset .count_rows () == 0
def test_filters_or_nonpartition_success(timdex_dataset_multi_source):
    """A list given for timdex_record_id returns rows for each listed id."""
    wanted_ids = ["alma:0", "alma:1"]
    records = timdex_dataset_multi_source.read_dataframe(timdex_record_id=wanted_ids)
    assert records is not None

    # compared as a set, so row order and duplicates are not asserted here
    returned_ids = set(records["timdex_record_id"].tolist())
    assert returned_ids == {"alma:0", "alma:1"}
205171
206172
207- def test_dataset_get_filtered_dataset_with_run_date_obj_success (
208- timdex_dataset_multi_source ,
209- ):
210- filtered_timdex_dataset = timdex_dataset_multi_source ._get_filtered_dataset (
211- run_date = date (2024 , 12 , 1 )
212- )
213- empty_timdex_dataset = timdex_dataset_multi_source ._get_filtered_dataset (
214- run_date = date (2024 , 12 , 2 )
215- )
def test_filters_run_date_str_success(timdex_dataset_multi_source):
    """run_date given as a 'YYYY-MM-DD' string: one date reads, another is empty."""
    matching = timdex_dataset_multi_source.read_dataframe(run_date="2024-12-01")
    assert matching is not None

    # a date with no records may come back as None or as an empty frame
    no_match = timdex_dataset_multi_source.read_dataframe(run_date="2024-12-02")
    if no_match is not None:
        assert len(no_match) == 0
216178
217- # timdex_dataset_multi_source consists of single 'run_date' value
218- # therefore, filtered_timdex_dataset includes all records
219- assert (
220- filtered_timdex_dataset .count_rows ()
221- == timdex_dataset_multi_source .dataset .count_rows ()
222- )
223- assert empty_timdex_dataset .count_rows () == 0
224179
def test_filters_run_date_obj_success(timdex_dataset_multi_source):
    """run_date given as a datetime.date: one date reads, another is empty."""
    present_day = date(2024, 12, 1)
    matching = timdex_dataset_multi_source.read_dataframe(run_date=present_day)
    assert matching is not None

    # a date with no records may come back as None or as an empty frame
    absent_day = date(2024, 12, 2)
    no_match = timdex_dataset_multi_source.read_dataframe(run_date=absent_day)
    if no_match is not None:
        assert len(no_match) == 0
225185
226- def test_dataset_get_filtered_dataset_with_ymd_success (timdex_dataset_multi_source ):
227- filtered_timdex_dataset = timdex_dataset_multi_source ._get_filtered_dataset (
228- year = "2024"
229- )
230- empty_timdex_dataset = timdex_dataset_multi_source ._get_filtered_dataset (year = "2025" )
231186
232- # timdex_dataset_multi_source consists of single 'run_date' value
233- # therefore, filtered_timdex_dataset includes all records
234- assert (
235- filtered_timdex_dataset .count_rows ()
236- == timdex_dataset_multi_source .dataset .count_rows ()
237- )
238- assert empty_timdex_dataset .count_rows () == 0
def test_filters_ymd_success(timdex_dataset_multi_source):
    """Year-level filtering, expressed through run_date values in 2024 and 2025."""
    # metadata filters do not expose the year/month/day partition columns, so
    # equivalent run_date values are used in their place
    in_2024 = date(2024, 12, 1)
    matching = timdex_dataset_multi_source.read_dataframe(run_date=in_2024)
    assert matching is not None

    # a date in a different year may come back as None or as an empty frame
    in_2025 = date(2025, 12, 1)
    no_match = timdex_dataset_multi_source.read_dataframe(run_date=in_2025)
    if no_match is not None:
        assert len(no_match) == 0
239193
240194
241- def test_dataset_get_filtered_dataset_with_run_date_invalid_raise_error (
242- timdex_dataset_multi_source ,
243- ):
def test_filters_run_date_invalid_raise_error(timdex_dataset_multi_source):
    """An integer run_date (999) must raise ConversionException on read."""
    # pytest.raises treats `match` as a regex searched against the error text
    expected_message = "Conversion Error: Unimplemented type for cast"
    with pytest.raises(ConversionException, match=expected_message):
        timdex_dataset_multi_source.read_dataframe(run_date=999)
252200
253201
254202def test_dataset_get_s3_filesystem_success (mocker ):
@@ -272,8 +220,10 @@ def test_dataset_timdex_dataset_row_count_success(timdex_dataset):
272220 assert timdex_dataset .dataset .count_rows () == timdex_dataset .dataset .count_rows ()
273221
274222
275- def test_dataset_all_records_not_current_and_not_deduped (timdex_dataset_with_runs ):
276- all_records_df = timdex_dataset_with_runs .read_dataframe ()
223+ def test_dataset_all_records_not_current_and_not_deduped (
224+ timdex_dataset_with_runs_with_metadata ,
225+ ):
226+ all_records_df = timdex_dataset_with_runs_with_metadata .read_dataframe ()
277227
278228 # assert counts reflect all records from dataset, no deduping
279229 assert all_records_df .source .value_counts ().to_dict () == {"alma" : 254 , "dspace" : 194 }
0 commit comments