@@ -35,7 +35,7 @@ def test_dataset_load_local_sets_filesystem_and_dataset_success(
3535 result = timdex_dataset .load ()
3636
3737 mock_pyarrow_ds .assert_called_once_with (
38- "local/path/to/dataset/ " ,
38+ "local/path/to/dataset" ,
3939 schema = timdex_dataset .schema ,
4040 format = "parquet" ,
4141 partitioning = "hive" ,
@@ -59,7 +59,7 @@ def test_dataset_load_s3_sets_filesystem_and_dataset_success(
5959
6060 mock_get_s3_fs .assert_called_once ()
6161 mock_pyarrow_ds .assert_called_once_with (
62- "bucket/path/to/dataset/ " ,
62+ "bucket/path/to/dataset" ,
6363 schema = timdex_dataset .schema ,
6464 format = "parquet" ,
6565 partitioning = "hive" ,
@@ -69,60 +69,55 @@ def test_dataset_load_s3_sets_filesystem_and_dataset_success(
6969 assert result is None
7070
7171
72- @patch ("timdex_dataset_api.dataset.fs.LocalFileSystem" )
73- @patch ("timdex_dataset_api.dataset.ds.dataset" )
74- def test_dataset_load_with_partition_prefix_via_run_date_success (
75- mock_pyarrow_ds , mock_local_fs
76- ):
77- mock_local_fs .return_value = MagicMock ()
78- mock_pyarrow_ds .return_value = MagicMock ()
72+ def test_dataset_load_without_filters_success (fixed_local_dataset ):
73+ fixed_local_dataset .load ()
7974
80- timdex_dataset = TIMDEXDataset ( location = "local/path/to/dataset" )
81- timdex_dataset . load ( run_date = "2024-12-01" )
75+ assert os . path . exists ( fixed_local_dataset . location )
76+ assert fixed_local_dataset . row_count == 5_000 # noqa: PLR2004
8277
83- mock_pyarrow_ds .assert_called_once_with (
84- "local/path/to/dataset/year=2024/month=12/day=01" ,
85- schema = timdex_dataset .schema ,
86- format = "parquet" ,
87- partitioning = "hive" ,
88- filesystem = mock_local_fs .return_value ,
89- )
9078
79+ def test_dataset_load_with_run_date_str_filters_success (fixed_local_dataset ):
80+ fixed_local_dataset .load (run_date = "2024-12-01" )
9181
92- @patch ("timdex_dataset_api.dataset.fs.LocalFileSystem" )
93- @patch ("timdex_dataset_api.dataset.ds.dataset" )
94- def test_dataset_load_with_partition_prefix_via_run_date_components_success (
95- mock_pyarrow_ds , mock_local_fs
96- ):
97- mock_local_fs .return_value = MagicMock ()
98- mock_pyarrow_ds .return_value = MagicMock ()
82+ assert os .path .exists (fixed_local_dataset .location )
83+ assert fixed_local_dataset .row_count == 5_000 # noqa: PLR2004
9984
100- timdex_dataset = TIMDEXDataset (location = "local/path/to/dataset" )
101- timdex_dataset .load (year = "2024" )
10285
103- mock_pyarrow_ds .assert_called_once_with (
104- "local/path/to/dataset/year=2024" ,
105- schema = timdex_dataset .schema ,
106- format = "parquet" ,
107- partitioning = "hive" ,
108- filesystem = mock_local_fs .return_value ,
109- )
86+ def test_dataset_load_with_run_date_obj_filters_success (fixed_local_dataset ):
87+ fixed_local_dataset .load (run_date = date (2024 , 12 , 1 ))
88+
89+ assert os .path .exists (fixed_local_dataset .location )
90+ assert fixed_local_dataset .row_count == 5_000 # noqa: PLR2004
11091
11192
112- def test_dataset_load_no_filters_success (fixed_local_dataset ):
113- fixed_local_dataset .load ()
93+ def test_dataset_load_with_ymd_filters_success (fixed_local_dataset ):
94+ fixed_local_dataset .load (year = "2024" , month = "12" , day = "01" )
11495
11596 assert os .path .exists (fixed_local_dataset .location )
11697 assert fixed_local_dataset .row_count == 5_000 # noqa: PLR2004
11798
11899
119- def test_dataset_load_and_filter_by_non_partition_field_success (fixed_local_dataset ):
100+ def test_dataset_load_with_single_nonpartition_filters_success (fixed_local_dataset ):
120101 fixed_local_dataset .load (timdex_record_id = "alma:0" )
121102
122103 assert fixed_local_dataset .row_count == 1
123104
124105
125- def test_dataset_get_filtered_dataset_by_all_fields_success (fixed_local_dataset ):
106+ def test_dataset_load_with_multi_nonpartition_filters_success (fixed_local_dataset ):
107+ fixed_local_dataset .load (
108+ timdex_record_id = "alma:0" ,
109+ source = "alma" ,
110+ run_type = "daily" ,
111+ run_id = "abc123" ,
112+ action = "index" ,
113+ )
114+
115+ assert fixed_local_dataset .row_count == 1
116+
117+
118+ def test_dataset_get_filtered_dataset_with_multi_nonpartition_filters_success (
119+ fixed_local_dataset ,
120+ ):
126121 fixed_local_dataset .load () # initial load dataset, no filters passed
127122
128123 filtered_local_dataset = fixed_local_dataset ._get_filtered_dataset (
@@ -138,7 +133,9 @@ def test_dataset_get_filtered_dataset_by_all_fields_success(fixed_local_dataset)
138133 assert filtered_local_df ["timdex_record_id" ].iloc [0 ] == "alma:0"
139134
140135
141- def test_dataset_get_filtered_dataset_by_single_fields_success (fixed_local_dataset ):
136+ def test_dataset_get_filtered_dataset_with_single_nonpartition_success (
137+ fixed_local_dataset ,
138+ ):
142139 fixed_local_dataset .load () # initial load dataset, no filters passed
143140
144141 filtered_local_dataset = fixed_local_dataset ._get_filtered_dataset (
@@ -152,7 +149,7 @@ def test_dataset_get_filtered_dataset_by_single_fields_success(fixed_local_datas
152149 assert filtered_local_df ["run_id" ].unique () == ["abc123" ]
153150
154151
155- def test_dataset_get_filtered_dataset_by_run_date_str_successs (fixed_local_dataset ):
152+ def test_dataset_get_filtered_dataset_with_run_date_str_successs (fixed_local_dataset ):
156153 fixed_local_dataset .load () # initial load dataset, no filters passed
157154
158155 filtered_local_dataset = fixed_local_dataset ._get_filtered_dataset (
@@ -166,7 +163,7 @@ def test_dataset_get_filtered_dataset_by_run_date_str_successs(fixed_local_datas
166163 assert empty_local_dataset .count_rows () == 0
167164
168165
169- def test_dataset_get_filtered_dataset_by_run_date_date_success (fixed_local_dataset ):
166+ def test_dataset_get_filtered_dataset_with_run_date_obj_success (fixed_local_dataset ):
170167 fixed_local_dataset .load () # initial load dataset, no filters passed
171168
172169 filtered_local_dataset = fixed_local_dataset ._get_filtered_dataset (
@@ -182,7 +179,7 @@ def test_dataset_get_filtered_dataset_by_run_date_date_success(fixed_local_datas
182179 assert empty_local_dataset .count_rows () == 0
183180
184181
185- def test_dataset_get_filtered_dataset_by_run_date_components_success (fixed_local_dataset ):
182+ def test_dataset_get_filtered_dataset_with_ymd_success (fixed_local_dataset ):
186183 fixed_local_dataset .load () # initial load dataset, no filters passed
187184
188185 filtered_local_dataset = fixed_local_dataset ._get_filtered_dataset (year = "2024" )
@@ -194,13 +191,13 @@ def test_dataset_get_filtered_dataset_by_run_date_components_success(fixed_local
194191 assert empty_local_dataset .count_rows () == 0
195192
196193
197- def test_dataset_get_filtered_dataset_by_run_date_if_invalid_type_raise_error (
194+ def test_dataset_get_filtered_dataset_with_run_date_invalid_raise_error (
198195 fixed_local_dataset ,
199196):
200197 fixed_local_dataset .load () # initial load dataset, no filters passed
201198
202199 with pytest .raises (
203- ValueError ,
200+ TypeError ,
204201 match = (
205202 "Provided 'run_date' value must be a string matching format '%Y-%m-%d' "
206203 "or a datetime.date."
@@ -209,36 +206,6 @@ def test_dataset_get_filtered_dataset_by_run_date_if_invalid_type_raise_error(
209206 _ = fixed_local_dataset ._get_filtered_dataset (run_date = 999 )
210207
211208
212- def test_dataset_get_partition_prefixes_with_run_date_success ():
213- timdex_dataset = TIMDEXDataset (location = "s3://bucket/path/to/dataset" )
214-
215- assert (
216- timdex_dataset ._get_partition_prefixes (run_date = "2024-12-01" )
217- == "year=2024/month=12/day=01"
218- )
219-
220-
221- def test_dataset_get_partition_prefixes_without_run_date_success ():
222- timdex_dataset = TIMDEXDataset (location = "s3://bucket/path/to/dataset" )
223-
224- assert (
225- timdex_dataset ._get_partition_prefixes (year = "2024" , month = "12" , day = "01" )
226- ) == "year=2024/month=12/day=01"
227- assert (
228- timdex_dataset ._get_partition_prefixes (year = "2024" , month = "12" )
229- == "year=2024/month=12"
230- )
231- assert timdex_dataset ._get_partition_prefixes (year = "2024" ) == "year=2024"
232-
233-
234- def test_dataset_get_partition_prefixes_without_run_date_raise_error ():
235- timdex_dataset = TIMDEXDataset (location = "s3://bucket/path/to/dataset" )
236- with pytest .raises (
237- ValueError , match = "Insufficient arguments to construct a valid partition prefix."
238- ):
239- assert timdex_dataset ._get_partition_prefixes (month = "12" , day = "01" )
240-
241-
242209def test_dataset_get_s3_filesystem_success (mocker ):
243210 mocked_s3_filesystem = mocker .spy (fs , "S3FileSystem" )
244211 s3_filesystem = TIMDEXDataset .get_s3_filesystem ()
0 commit comments