1212from timdex_dataset_api .dataset import (
1313 MAX_ROWS_PER_FILE ,
1414 TIMDEX_DATASET_SCHEMA ,
15- DatasetNotLoadedError ,
1615 TIMDEXDataset ,
1716)
1817from timdex_dataset_api .record import DatasetRecord
@@ -30,6 +29,7 @@ def test_dataset_record_init():
3029 "run_id" : "000-111-aaa-bbb" ,
3130 }
3231 record = DatasetRecord (** values )
32+
3333 assert record
3434 assert (record .year , record .month , record .day ) == (
3535 "2024" ,
@@ -49,6 +49,7 @@ def test_dataset_record_init_with_invalid_run_date_raise_error():
4949 "action" : "index" ,
5050 "run_id" : "000-111-aaa-bbb" ,
5151 }
52+
5253 with pytest .raises (
5354 ValueError , match = re .escape ("time data '-12-01' does not match format '%Y-%m-%d'" )
5455 ):
@@ -67,6 +68,7 @@ def test_dataset_record_serialization():
6768 "run_id" : "abc123" ,
6869 }
6970 dataset_record = DatasetRecord (** values )
71+
7072 assert dataset_record .to_dict () == {
7173 "timdex_record_id" : "alma:123" ,
7274 "source_record" : b"<record><title>Hello World.</title></record>" ,
@@ -82,47 +84,38 @@ def test_dataset_record_serialization():
8284 }
8385
8486
85- def test_dataset_write_records_to_new_dataset (new_dataset , sample_records_iter ):
86- files_written = new_dataset .write (sample_records_iter (10_000 ))
87- assert len (files_written ) == 1
88- assert os .path .exists (new_dataset .location )
89-
90- # load newly created dataset as new TIMDEXDataset instance
91- dataset = TIMDEXDataset .load (new_dataset .location )
92- assert dataset .row_count == 10_000
93-
94-
95- def test_dataset_reload_after_write (new_dataset , sample_records_iter ):
96- files_written = new_dataset .write (sample_records_iter (10_000 ))
97- assert len (files_written ) == 1
98- assert os .path .exists (new_dataset .location )
99-
100- # attempt row count before reload
101- with pytest .raises (DatasetNotLoadedError ):
102- _ = new_dataset .row_count
87+ def test_dataset_write_records_to_new_local_dataset (
88+ new_local_dataset , sample_records_iter
89+ ):
90+ written_files = new_local_dataset .write (sample_records_iter (10_000 ))
91+ new_local_dataset .load ()
10392
104- # attempt row count after reload
105- new_dataset . reload ( )
106- assert new_dataset .row_count == 10_000
93+ assert len ( written_files ) == 1
94+ assert os . path . exists ( new_local_dataset . location )
95+ assert new_local_dataset .row_count == 10_000
10796
10897
109- def test_dataset_write_default_max_rows_per_file (new_dataset , sample_records_iter ):
98+ def test_dataset_write_default_max_rows_per_file (new_local_dataset , sample_records_iter ):
11099 """Default is 100k rows per file, therefore writing 200,033 records should result in
111100 3 files (x2 @ 100k rows, x1 @ 33 rows)."""
112101 total_records = 200_033
113102
114- new_dataset .write (sample_records_iter (total_records ))
115- new_dataset . reload ()
103+ new_local_dataset .write (sample_records_iter (total_records ))
104+ new_local_dataset . load ()
116105
117- assert new_dataset .row_count == total_records
118- assert len (new_dataset .dataset .files ) == math .ceil (total_records / MAX_ROWS_PER_FILE )
106+ assert new_local_dataset .row_count == total_records
107+ assert len (new_local_dataset .dataset .files ) == math .ceil (
108+ total_records / MAX_ROWS_PER_FILE
109+ )
119110
120111
121- def test_dataset_write_record_batches_uses_batch_size (new_dataset , sample_records_iter ):
112+ def test_dataset_write_record_batches_uses_batch_size (
113+ new_local_dataset , sample_records_iter
114+ ):
122115 total_records = 101
123116 batch_size = 50
124117 batches = list (
125- new_dataset .get_dataset_record_batches (
118+ new_local_dataset .get_dataset_record_batches (
126119 sample_records_iter (total_records ), batch_size = batch_size
127120 )
128121 )
@@ -140,63 +133,69 @@ def test_dataset_write_to_multiple_locations_raise_error(sample_records_iter):
140133 timdex_dataset .write (sample_records_iter (10 ))
141134
142135
143- def test_dataset_write_schema_applied_to_dataset (new_dataset , sample_records_iter ):
144- new_dataset .write (sample_records_iter (10 ))
136+ def test_dataset_write_schema_applied_to_dataset (new_local_dataset , sample_records_iter ):
137+ new_local_dataset .write (sample_records_iter (10 ))
145138
146139 # manually load dataset to confirm schema without TIMDEXDataset projecting schema
147140 # during load
148141 dataset = ds .dataset (
149- new_dataset .location ,
142+ new_local_dataset .location ,
150143 format = "parquet" ,
151144 partitioning = "hive" ,
152145 )
153146
154147 assert set (dataset .schema .names ) == set (TIMDEX_DATASET_SCHEMA .names )
155148
156149
157- def test_dataset_write_partition_for_single_source (new_dataset , sample_records_iter ):
158- written_files = new_dataset .write (sample_records_iter (10 ))
150+ def test_dataset_write_partition_for_single_source (
151+ new_local_dataset , sample_records_iter
152+ ):
153+ written_files = new_local_dataset .write (sample_records_iter (10 ))
159154 assert len (written_files ) == 1
160- assert os .path .exists (new_dataset .location )
155+ assert os .path .exists (new_local_dataset .location )
161156 assert "year=2024/month=12/day=01" in written_files [0 ].path
162157
163158
164- def test_dataset_write_partition_for_multiple_sources (new_dataset , sample_records_iter ):
159+ def test_dataset_write_partition_for_multiple_sources (
160+ new_local_dataset , sample_records_iter
161+ ):
165162 # perform write for source="alma" and run_date="2024-12-01"
166- written_files_source_a = new_dataset .write (sample_records_iter (10 ))
167- new_dataset . reload ()
163+ written_files_source_a = new_local_dataset .write (sample_records_iter (10 ))
164+ new_local_dataset . load ()
168165
169166 assert os .path .exists (written_files_source_a [0 ].path )
170- assert new_dataset .row_count == 10
167+ assert new_local_dataset .row_count == 10
171168
172169 # perform write for source="libguides" and run_date="2024-12-01"
173- written_files_source_b = new_dataset .write (
170+ written_files_source_b = new_local_dataset .write (
174171 generate_sample_records (
175172 num_records = 7 , timdex_record_id_prefix = "libguides" , source = "libguides"
176173 )
177174 )
178- new_dataset . reload ()
175+ new_local_dataset . load ()
179176
180177 assert os .path .exists (written_files_source_b [0 ].path )
181178 assert os .path .exists (written_files_source_a [0 ].path )
182- assert new_dataset .row_count == 17
179+ assert new_local_dataset .row_count == 17
183180
184181
185- def test_dataset_write_partition_ignore_existing_data (new_dataset , sample_records_iter ):
182+ def test_dataset_write_partition_ignore_existing_data (
183+ new_local_dataset , sample_records_iter
184+ ):
186185 # perform two (2) writes for source="alma" and run_date="2024-12-01"
187- written_files_source_a0 = new_dataset .write (sample_records_iter (10 ))
188- written_files_source_a1 = new_dataset .write (sample_records_iter (10 ))
189- new_dataset . reload ()
186+ written_files_source_a0 = new_local_dataset .write (sample_records_iter (10 ))
187+ written_files_source_a1 = new_local_dataset .write (sample_records_iter (10 ))
188+ new_local_dataset . load ()
190189
191190 # assert that both files exist and no overwriting occurs
192191 assert os .path .exists (written_files_source_a0 [0 ].path )
193192 assert os .path .exists (written_files_source_a1 [0 ].path )
194- assert new_dataset .row_count == 20
193+ assert new_local_dataset .row_count == 20
195194
196195
197196@patch ("timdex_dataset_api.dataset.uuid.uuid4" )
198197def test_dataset_write_partition_overwrite_files_with_same_name (
199- mock_uuid , new_dataset , sample_records_iter
198+ mock_uuid , new_local_dataset , sample_records_iter
200199):
201200 """This test is to demonstrate existing_data_behavior="overwrite_or_ignore".
202201
@@ -207,10 +206,10 @@ def test_dataset_write_partition_overwrite_files_with_same_name(
207206 mock_uuid .return_value = "abc"
208207
209208 # perform two (2) writes for source="alma" and run_date="2024-12-01"
210- _ = new_dataset .write (sample_records_iter (10 ))
211- written_files_source_a1 = new_dataset .write (sample_records_iter (7 ))
212- new_dataset . reload ()
209+ _ = new_local_dataset .write (sample_records_iter (10 ))
210+ written_files_source_a1 = new_local_dataset .write (sample_records_iter (7 ))
211+ new_local_dataset . load ()
213212
214213 # assert that only the second file exists and overwriting occurs
215214 assert os .path .exists (written_files_source_a1 [0 ].path )
216- assert new_dataset .row_count == 7
215+ assert new_local_dataset .row_count == 7
0 commit comments