@@ -59,70 +59,66 @@ def test_dataset_record_serialization_with_partition_values_provided():
     }
 
 
-def test_dataset_write_records_to_new_dataset(new_temp_dataset, small_records_iter):
-    files_written = new_temp_dataset.write(small_records_iter(10_000))
+def test_dataset_write_records_to_new_dataset(new_dataset, sample_records_iter):
+    files_written = new_dataset.write(sample_records_iter(10_000))
     assert len(files_written) == 1
-    assert os.path.exists(new_temp_dataset.location)
+    assert os.path.exists(new_dataset.location)
 
     # load newly created dataset as new TIMDEXDataset instance
-    dataset = TIMDEXDataset.load(new_temp_dataset.location)
+    dataset = TIMDEXDataset.load(new_dataset.location)
     assert dataset.row_count == 10_000
 
 
-def test_dataset_reload_after_write(new_temp_dataset, small_records_iter):
-    files_written = new_temp_dataset.write(small_records_iter(10_000))
+def test_dataset_reload_after_write(new_dataset, sample_records_iter):
+    files_written = new_dataset.write(sample_records_iter(10_000))
     assert len(files_written) == 1
-    assert os.path.exists(new_temp_dataset.location)
+    assert os.path.exists(new_dataset.location)
 
     # attempt row count before reload
     with pytest.raises(DatasetNotLoadedError):
-        _ = new_temp_dataset.row_count
+        _ = new_dataset.row_count
 
     # attempt row count after reload
-    new_temp_dataset.reload()
-    assert new_temp_dataset.row_count == 10_000
+    new_dataset.reload()
+    assert new_dataset.row_count == 10_000
 
 
-def test_dataset_write_default_max_rows_per_file(new_temp_dataset, small_records_iter):
+def test_dataset_write_default_max_rows_per_file(new_dataset, sample_records_iter):
     """Default is 100k rows per file, therefore writing 200,033 records should result in
     3 files (x2 @ 100k rows, x1 @ 33 rows)."""
     total_records = 200_033
 
-    new_temp_dataset.write(small_records_iter(total_records))
-    new_temp_dataset.reload()
+    new_dataset.write(sample_records_iter(total_records))
+    new_dataset.reload()
 
-    assert new_temp_dataset.row_count == total_records
-    assert len(new_temp_dataset.dataset.files) == math.ceil(
-        total_records / MAX_ROWS_PER_FILE
-    )
+    assert new_dataset.row_count == total_records
+    assert len(new_dataset.dataset.files) == math.ceil(total_records / MAX_ROWS_PER_FILE)
 
 
-def test_dataset_write_record_batches_uses_batch_size(
-    new_temp_dataset, small_records_iter
-):
+def test_dataset_write_record_batches_uses_batch_size(new_dataset, sample_records_iter):
     total_records = 101
     batch_size = 50
     batches = list(
-        new_temp_dataset.get_dataset_record_batches(
-            small_records_iter(total_records), batch_size=batch_size
+        new_dataset.get_dataset_record_batches(
+            sample_records_iter(total_records), batch_size=batch_size
         )
     )
     assert len(batches) == math.ceil(total_records / batch_size)
 
 
-def test_dataset_write_to_multiple_locations_raise_error(small_records_iter):
+def test_dataset_write_to_multiple_locations_raise_error(sample_records_iter):
     timdex_dataset = TIMDEXDataset(
         location=["/path/to/records-1.parquet", "/path/to/records-2.parquet"]
     )
     with pytest.raises(
         TypeError,
         match="Dataset location must be the root of a single dataset for writing",
     ):
-        timdex_dataset.write(small_records_iter(10))
+        timdex_dataset.write(sample_records_iter(10))
 
 
 def test_dataset_write_mixin_partition_values_used(
-    new_temp_dataset, small_records_iter_without_partitions
+    new_dataset, sample_records_iter_without_partitions
 ):
     partition_values = {
         "source": "alma",
@@ -131,14 +127,14 @@ def test_dataset_write_mixin_partition_values_used(
131127 "action" : "index" ,
132128 "run_id" : "000-111-aaa-bbb" ,
133129 }
134- _written_files = new_temp_dataset .write (
135- small_records_iter_without_partitions (10 ),
130+ _written_files = new_dataset .write (
131+ sample_records_iter_without_partitions (10 ),
136132 partition_values = partition_values ,
137133 )
138- new_temp_dataset .reload ()
134+ new_dataset .reload ()
139135
140136 # load as pandas dataframe and assert column values
141- df = new_temp_dataset .dataset .to_table ().to_pandas ()
137+ df = new_dataset .dataset .to_table ().to_pandas ()
142138 row = df .iloc [0 ]
143139 assert row .source == partition_values ["source" ]
144140 assert row .run_date == datetime .date (2024 , 12 , 1 )
@@ -148,10 +144,10 @@ def test_dataset_write_mixin_partition_values_used(
 
 
 def test_dataset_write_schema_partitions_correctly_ordered(
-    new_temp_dataset, small_records_iter
+    new_dataset, sample_records_iter
 ):
-    written_files = new_temp_dataset.write(
-        small_records_iter(10),
+    written_files = new_dataset.write(
+        sample_records_iter(10),
         partition_values={
             "source": "alma",
             "run_date": "2024-12-01",
@@ -167,13 +163,13 @@ def test_dataset_write_schema_partitions_correctly_ordered(
     )
 
 
-def test_dataset_write_schema_applied_to_dataset(new_temp_dataset, small_records_iter):
-    new_temp_dataset.write(small_records_iter(10))
+def test_dataset_write_schema_applied_to_dataset(new_dataset, sample_records_iter):
+    new_dataset.write(sample_records_iter(10))
 
     # manually load dataset to confirm schema without TIMDEXDataset projecting schema
     # during load
     dataset = ds.dataset(
-        new_temp_dataset.location,
+        new_dataset.location,
         format="parquet",
         partitioning="hive",
     )
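
Two assertions in this diff reduce to ceiling division. As a standalone sanity check of the expected counts, here is a minimal sketch assuming MAX_ROWS_PER_FILE is 100_000, the default the docstring above states:

import math

MAX_ROWS_PER_FILE = 100_000  # assumed value, per the docstring above

# 200,033 records -> two full 100k-row files plus one 33-row remainder file
assert math.ceil(200_033 / MAX_ROWS_PER_FILE) == 3

# 101 records batched 50 at a time -> batches of 50, 50, and 1
assert math.ceil(101 / 50) == 3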
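The tests reference the renamed fixtures new_dataset, sample_records_iter, and sample_records_iter_without_partitions, whose definitions sit outside this diff. A minimal conftest.py sketch of what they might look like; the fixture bodies, the dict-shaped records, and any field name not asserted above (such as timdex_record_id) are assumptions, not the project's actual code:

# Hypothetical conftest.py sketch; implementations are assumptions
# inferred from the tests above, not the project's actual fixtures.
import uuid
from collections.abc import Callable, Iterator

import pytest

from timdex_dataset_api import TIMDEXDataset  # import path assumed


@pytest.fixture
def new_dataset(tmp_path) -> TIMDEXDataset:
    # a TIMDEXDataset rooted at an empty temporary directory, not yet loaded
    return TIMDEXDataset(location=str(tmp_path / "dataset"))


@pytest.fixture
def sample_records_iter() -> Callable[[int], Iterator[dict]]:
    # factory fixture: call with a count to get that many sample records
    def _iter(count: int) -> Iterator[dict]:
        for _ in range(count):
            yield {
                "timdex_record_id": f"alma:{uuid.uuid4()}",  # field name assumed
                "source": "alma",
                "run_date": "2024-12-01",
                "action": "index",
                "run_id": "000-111-aaa-bbb",
            }

    return _iter


@pytest.fixture
def sample_records_iter_without_partitions(sample_records_iter):
    # same records with the partition columns stripped, so that
    # write(..., partition_values=...) must supply them instead
    def _iter(count: int) -> Iterator[dict]:
        partition_keys = {"source", "run_date", "action", "run_id"}
        for record in sample_records_iter(count):
            yield {k: v for k, v in record.items() if k not in partition_keys}

    return _iter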