Skip to content

Commit 7759a2f

Browse files
committed
test to exercise partition delete on write
1 parent 99e20ec commit 7759a2f

1 file changed

Lines changed: 51 additions & 0 deletions

File tree

tests/test_dataset_write.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,3 +175,54 @@ def test_dataset_write_schema_applied_to_dataset(new_dataset, sample_records_ite
175175
)
176176

177177
assert set(dataset.schema.names) == set(TIMDEX_DATASET_SCHEMA.names)
178+
179+
180+
def test_dataset_write_partition_deleted_when_written_to_again(
    new_dataset, sample_records_iter
):
    """Check that re-writing a partition replaces the files already in it.

    This tests the existing_data_behavior="delete_matching" configuration when
    writing to a dataset. Three writes are performed:

    1. a first write to run_date="2024-12-01",
    2. an unrelated write to run_date="2024-12-15",
    3. a second write to run_date="2024-12-01" (same partition values as #1).

    After the third write, the files from the first write are expected to be
    gone, while the files from the second and third writes remain.
    """
    # Record counts are named so the expected row count below is derived from
    # them rather than being restated by hand in a comment.
    target_partition_record_count = 10
    unrelated_partition_record_count = 7

    partition_values = {
        "source": "alma",
        "run_date": "2024-12-01",
        "run_type": "daily",
        "action": "index",
        "run_id": "000-111-aaa-bbb",
    }

    # perform FIRST write to run_date="2024-12-01"
    written_files_1 = new_dataset.write(
        sample_records_iter(target_partition_record_count),
        partition_values=partition_values,
    )

    # assert that files from first write are present at this time
    assert os.path.exists(written_files_1[0].path)

    # perform unrelated write with new run_date to confirm this is untouched
    # during delete
    new_partition_values = {
        **partition_values,
        "run_date": "2024-12-15",
        "run_id": "222-333-ccc-ddd",
    }
    written_files_x = new_dataset.write(
        sample_records_iter(unrelated_partition_record_count),
        partition_values=new_partition_values,
    )

    # perform SECOND write to run_date="2024-12-01", expecting this to delete
    # everything under this combination of partitions (i.e. the first write)
    written_files_2 = new_dataset.write(
        sample_records_iter(target_partition_record_count),
        partition_values=partition_values,
    )

    new_dataset.reload()

    # assert 17 rows: second write for run_date="2024-12-01" @ 10 rows +
    # run_date="2024-12-15" @ 7 rows (the first write's 10 rows were replaced)
    assert new_dataset.row_count == (
        target_partition_record_count + unrelated_partition_record_count
    )

    # assert that files from first run_date="2024-12-01" are gone, second exist
    # and files from run_date="2024-12-15" also exist
    assert not os.path.exists(written_files_1[0].path)
    assert os.path.exists(written_files_2[0].path)
    assert os.path.exists(written_files_x[0].path)

0 commit comments

Comments
 (0)