Commit 6b93d88

Replace static fixture of local dataset with temp, dynamic dataset
Why these changes are being introduced:
* This avoids the creation of artifacts in the 'tests/fixtures/' directory that become outdated if/when the partitioning scheme changes. It also updates the 'local_dataset' fixture to use the 'tmp_path' fixture from pytest, allowing each test that uses 'local_dataset' to run in a clean, isolated environment.

How this addresses that need:
* Update 'local_dataset' fixtures
* Update unit tests

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-440
1 parent e1c0c6a commit 6b93d88
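The isolation claim above rests on pytest's built-in `tmp_path` fixture, which injects a unique temporary directory (a `pathlib.Path`) into each test that requests it. A minimal sketch, not part of this commit, of the behavior the new fixtures rely on:

```python
# Illustrative only: pytest creates a fresh directory per test, so artifacts
# written by one test can never collide with another test's data.
def test_writes_are_isolated(tmp_path):
    dataset_dir = tmp_path / "dataset"
    dataset_dir.mkdir()
    (dataset_dir / "records.parquet").touch()

    # only this test's file exists here; no shared fixture state to clean up
    assert [p.name for p in dataset_dir.iterdir()] == ["records.parquet"]
```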

5 files changed: 64 additions & 58 deletions

File tree

  • tests
    • conftest.py
    • fixtures/local_datasets/dataset
      • source=alma
        • run_date=2023-03-06/run_type=daily/action=index/run_id=74afc7ba-9bbe-4f52-827d-c0595fa82036
        • run_date=2023-03-07/run_type=daily/action=delete/run_id=b831b653-028d-42eb-bf9f-c9fcdd46a982
      • source=libguides/run_date=2023-08-09/run_type=full/action=index/run_id=e90832e8-399f-476c-9b33-9ebe4120b5ab
    • test_dataset_write.py
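
The fixture paths above document the old hive-style partition layout (source/run_date/run_type/action/run_id), exactly the kind of on-disk artifact that goes stale when the scheme changes. For reference, a minimal sketch of how pyarrow produces such key=value directories; the table contents and output location here are illustrative, not from this repo:

```python
import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table(
    {
        "timdex_record_id": ["alma:1", "alma:2"],
        "source": ["alma", "alma"],
        "run_date": ["2023-03-06", "2023-03-06"],
    }
)

# hive-flavored partitioning turns partition columns into key=value directories,
# e.g. scratch_dataset/source=alma/run_date=2023-03-06/part-0.parquet
ds.write_dataset(
    table,
    "scratch_dataset",
    format="parquet",
    partitioning=ds.partitioning(
        pa.schema([("source", pa.string()), ("run_date", pa.string())]),
        flavor="hive",
    ),
)
```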

tests/conftest.py

Lines changed: 13 additions & 6 deletions
```diff
@@ -5,7 +5,10 @@
 
 import pytest
 
-from tests.utils import generate_sample_records
+from tests.utils import (
+    generate_sample_records,
+    generate_sample_records_with_simulated_partitions,
+)
 from timdex_dataset_api import TIMDEXDataset
 
 
@@ -19,18 +22,22 @@ def _test_env(monkeypatch):
 
 
 @pytest.fixture
-def local_dataset_location():
-    return "tests/fixtures/local_datasets/dataset"
+def local_dataset_location(tmp_path):
+    return str(tmp_path / "tests/fixtures/local_datasets/dataset")
 
 
 @pytest.fixture
 def local_dataset(local_dataset_location):
-    return TIMDEXDataset.load(local_dataset_location)
+    timdex_dataset = TIMDEXDataset(local_dataset_location)
+    records = generate_sample_records_with_simulated_partitions(num_records=5_000)
+    timdex_dataset.write(records)
+    timdex_dataset.load()
+    return timdex_dataset
 
 
 @pytest.fixture
-def new_dataset(tmp_path) -> TIMDEXDataset:
-    location = str(tmp_path / "new_dataset")
+def new_local_dataset(tmp_path) -> TIMDEXDataset:
+    location = str(tmp_path / "new_local_dataset")
     return TIMDEXDataset(location=location)
 
 
```

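The updated `local_dataset` fixture depends on `generate_sample_records_with_simulated_partitions`, a helper in `tests/utils.py` whose body this commit page does not show. A hypothetical sketch of what it might look like, assuming it lives alongside `generate_sample_records` and that the latter accepts a `run_date` keyword (the diff only confirms `num_records`, `timdex_record_id_prefix`, and `source`):

```python
import itertools

# Hypothetical sketch: generate_sample_records is assumed to be defined in the
# same tests/utils.py module, so no import is needed here.


def generate_sample_records_with_simulated_partitions(num_records: int):
    """Spread sample records across several sources and run dates so the
    written dataset exercises multiple partitions (sketch, not actual code)."""
    sources = ["alma", "libguides"]
    run_dates = ["2024-12-01", "2024-12-02"]  # run_date kwarg is an assumption
    per_combination = num_records // (len(sources) * len(run_dates))
    for source, run_date in itertools.product(sources, run_dates):
        yield from generate_sample_records(
            num_records=per_combination,
            timdex_record_id_prefix=source,
            source=source,
            run_date=run_date,
        )
```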
tests/test_dataset_write.py

Lines changed: 51 additions & 52 deletions
```diff
@@ -12,7 +12,6 @@
 from timdex_dataset_api.dataset import (
     MAX_ROWS_PER_FILE,
     TIMDEX_DATASET_SCHEMA,
-    DatasetNotLoadedError,
     TIMDEXDataset,
 )
 from timdex_dataset_api.record import DatasetRecord
@@ -30,6 +29,7 @@ def test_dataset_record_init():
         "run_id": "000-111-aaa-bbb",
     }
     record = DatasetRecord(**values)
+
     assert record
     assert (record.year, record.month, record.day) == (
         "2024",
@@ -49,6 +49,7 @@ def test_dataset_record_init_with_invalid_run_date_raise_error():
         "action": "index",
         "run_id": "000-111-aaa-bbb",
     }
+
     with pytest.raises(
         ValueError, match=re.escape("time data '-12-01' does not match format '%Y-%m-%d'")
     ):
@@ -67,6 +68,7 @@ def test_dataset_record_serialization():
         "run_id": "abc123",
     }
     dataset_record = DatasetRecord(**values)
+
     assert dataset_record.to_dict() == {
         "timdex_record_id": "alma:123",
         "source_record": b"<record><title>Hello World.</title></record>",
@@ -82,47 +84,38 @@ def test_dataset_record_serialization():
     }
 
 
-def test_dataset_write_records_to_new_dataset(new_dataset, sample_records_iter):
-    files_written = new_dataset.write(sample_records_iter(10_000))
-    assert len(files_written) == 1
-    assert os.path.exists(new_dataset.location)
-
-    # load newly created dataset as new TIMDEXDataset instance
-    dataset = TIMDEXDataset.load(new_dataset.location)
-    assert dataset.row_count == 10_000
-
-
-def test_dataset_reload_after_write(new_dataset, sample_records_iter):
-    files_written = new_dataset.write(sample_records_iter(10_000))
-    assert len(files_written) == 1
-    assert os.path.exists(new_dataset.location)
-
-    # attempt row count before reload
-    with pytest.raises(DatasetNotLoadedError):
-        _ = new_dataset.row_count
+def test_dataset_write_records_to_new_local_dataset(
+    new_local_dataset, sample_records_iter
+):
+    written_files = new_local_dataset.write(sample_records_iter(10_000))
+    new_local_dataset.load()
 
-    # attempt row count after reload
-    new_dataset.reload()
-    assert new_dataset.row_count == 10_000
+    assert len(written_files) == 1
+    assert os.path.exists(new_local_dataset.location)
+    assert new_local_dataset.row_count == 10_000
 
 
-def test_dataset_write_default_max_rows_per_file(new_dataset, sample_records_iter):
+def test_dataset_write_default_max_rows_per_file(new_local_dataset, sample_records_iter):
     """Default is 100k rows per file, therefore writing 200,033 records should result in
     3 files (x2 @ 100k rows, x1 @ 33 rows)."""
     total_records = 200_033
 
-    new_dataset.write(sample_records_iter(total_records))
-    new_dataset.reload()
+    new_local_dataset.write(sample_records_iter(total_records))
+    new_local_dataset.load()
 
-    assert new_dataset.row_count == total_records
-    assert len(new_dataset.dataset.files) == math.ceil(total_records / MAX_ROWS_PER_FILE)
+    assert new_local_dataset.row_count == total_records
+    assert len(new_local_dataset.dataset.files) == math.ceil(
+        total_records / MAX_ROWS_PER_FILE
+    )
 
 
-def test_dataset_write_record_batches_uses_batch_size(new_dataset, sample_records_iter):
+def test_dataset_write_record_batches_uses_batch_size(
+    new_local_dataset, sample_records_iter
+):
     total_records = 101
     batch_size = 50
     batches = list(
-        new_dataset.get_dataset_record_batches(
+        new_local_dataset.get_dataset_record_batches(
            sample_records_iter(total_records), batch_size=batch_size
         )
     )
@@ -140,63 +133,69 @@ def test_dataset_write_to_multiple_locations_raise_error(sample_records_iter):
     timdex_dataset.write(sample_records_iter(10))
 
 
-def test_dataset_write_schema_applied_to_dataset(new_dataset, sample_records_iter):
-    new_dataset.write(sample_records_iter(10))
+def test_dataset_write_schema_applied_to_dataset(new_local_dataset, sample_records_iter):
+    new_local_dataset.write(sample_records_iter(10))
 
     # manually load dataset to confirm schema without TIMDEXDataset projecting schema
     # during load
     dataset = ds.dataset(
-        new_dataset.location,
+        new_local_dataset.location,
         format="parquet",
         partitioning="hive",
     )
 
     assert set(dataset.schema.names) == set(TIMDEX_DATASET_SCHEMA.names)
 
 
-def test_dataset_write_partition_for_single_source(new_dataset, sample_records_iter):
-    written_files = new_dataset.write(sample_records_iter(10))
+def test_dataset_write_partition_for_single_source(
+    new_local_dataset, sample_records_iter
+):
+    written_files = new_local_dataset.write(sample_records_iter(10))
     assert len(written_files) == 1
-    assert os.path.exists(new_dataset.location)
+    assert os.path.exists(new_local_dataset.location)
     assert "year=2024/month=12/day=01" in written_files[0].path
 
 
-def test_dataset_write_partition_for_multiple_sources(new_dataset, sample_records_iter):
+def test_dataset_write_partition_for_multiple_sources(
+    new_local_dataset, sample_records_iter
+):
     # perform write for source="alma" and run_date="2024-12-01"
-    written_files_source_a = new_dataset.write(sample_records_iter(10))
-    new_dataset.reload()
+    written_files_source_a = new_local_dataset.write(sample_records_iter(10))
+    new_local_dataset.load()
 
     assert os.path.exists(written_files_source_a[0].path)
-    assert new_dataset.row_count == 10
+    assert new_local_dataset.row_count == 10
 
     # perform write for source="libguides" and run_date="2024-12-01"
-    written_files_source_b = new_dataset.write(
+    written_files_source_b = new_local_dataset.write(
         generate_sample_records(
             num_records=7, timdex_record_id_prefix="libguides", source="libguides"
         )
     )
-    new_dataset.reload()
+    new_local_dataset.load()
 
     assert os.path.exists(written_files_source_b[0].path)
     assert os.path.exists(written_files_source_a[0].path)
-    assert new_dataset.row_count == 17
+    assert new_local_dataset.row_count == 17
 
 
-def test_dataset_write_partition_ignore_existing_data(new_dataset, sample_records_iter):
+def test_dataset_write_partition_ignore_existing_data(
+    new_local_dataset, sample_records_iter
+):
     # perform two (2) writes for source="alma" and run_date="2024-12-01"
-    written_files_source_a0 = new_dataset.write(sample_records_iter(10))
-    written_files_source_a1 = new_dataset.write(sample_records_iter(10))
-    new_dataset.reload()
+    written_files_source_a0 = new_local_dataset.write(sample_records_iter(10))
+    written_files_source_a1 = new_local_dataset.write(sample_records_iter(10))
+    new_local_dataset.load()
 
     # assert that both files exist and no overwriting occurs
     assert os.path.exists(written_files_source_a0[0].path)
     assert os.path.exists(written_files_source_a1[0].path)
-    assert new_dataset.row_count == 20
+    assert new_local_dataset.row_count == 20
 
 
 @patch("timdex_dataset_api.dataset.uuid.uuid4")
 def test_dataset_write_partition_overwrite_files_with_same_name(
-    mock_uuid, new_dataset, sample_records_iter
+    mock_uuid, new_local_dataset, sample_records_iter
 ):
     """This test is to demonstrate existing_data_behavior="overwrite_or_ignore".
@@ -207,10 +206,10 @@ def test_dataset_write_partition_overwrite_files_with_same_name(
     mock_uuid.return_value = "abc"
 
     # perform two (2) writes for source="alma" and run_date="2024-12-01"
-    _ = new_dataset.write(sample_records_iter(10))
-    written_files_source_a1 = new_dataset.write(sample_records_iter(7))
-    new_dataset.reload()
+    _ = new_local_dataset.write(sample_records_iter(10))
+    written_files_source_a1 = new_local_dataset.write(sample_records_iter(7))
+    new_local_dataset.load()
 
     # assert that only the second file exists and overwriting occurs
     assert os.path.exists(written_files_source_a1[0].path)
-    assert new_dataset.row_count == 7
+    assert new_local_dataset.row_count == 7
```
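
The final test pins `uuid.uuid4` to a constant so that two writes produce files with identical names, demonstrating pyarrow's `existing_data_behavior="overwrite_or_ignore"`: colliding filenames are silently replaced while other files are left in place. A standalone sketch of the same behavior (the location and basename template here are illustrative):

```python
import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({"timdex_record_id": ["alma:1"]})

for _ in range(2):
    ds.write_dataset(
        table,
        "scratch_dataset",
        format="parquet",
        # fixed template: both writes target part-0.parquet ...
        basename_template="part-{i}.parquet",
        # ... and "overwrite_or_ignore" lets the second write replace the first
        existing_data_behavior="overwrite_or_ignore",
    )
# the dataset directory now contains only the second write's part-0.parquet
```

With the default `existing_data_behavior="error"`, the second write would instead raise because the destination already contains data.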
