 from dataclasses import dataclass, field
 from datetime import UTC, date, datetime
 from functools import reduce
-from typing import TYPE_CHECKING, TypedDict, Unpack
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal, TypedDict, Unpack
+from urllib.parse import urlparse

 import boto3
 import pandas as pd

 from timdex_dataset_api.config import configure_logger
 from timdex_dataset_api.exceptions import DatasetNotLoadedError
+from timdex_dataset_api.metadata import TIMDEXDatasetMetadata

 if TYPE_CHECKING:
     from timdex_dataset_api.record import DatasetRecord  # pragma: nocover
@@ -117,19 +120,38 @@ def __init__(
         self.config = config or TIMDEXDatasetConfig()
         self.location = location

+        self.create_data_structure()
+
         # pyarrow dataset
-        self.filesystem, self.paths = self.parse_location(self.location)
+        self.filesystem, self.paths = self.parse_location(self.data_records_root)
         self.dataset: ds.Dataset = None  # type: ignore[assignment]
         self.schema = TIMDEX_DATASET_SCHEMA
         self.partition_columns = TIMDEX_DATASET_PARTITION_COLUMNS

-        # writing
-        self._written_files: list[ds.WrittenFile] = None  # type: ignore[assignment]
+        # dataset metadata
+        self.metadata = TIMDEXDatasetMetadata(location)  # type: ignore[arg-type]
+
+    @property
+    def location_scheme(self) -> Literal["file", "s3"]:
+        scheme = urlparse(self.location).scheme  # type: ignore[arg-type]
+        if scheme == "":
+            return "file"
+        if scheme == "s3":
+            return "s3"
+        raise ValueError(f"Location with scheme type '{scheme}' not supported.")

     @property
     def data_records_root(self) -> str:
         return f"{self.location.removesuffix('/')}/data/records"  # type: ignore[union-attr]

+    def create_data_structure(self) -> None:
+        """Ensure ETL records data structure exists in TIMDEX dataset."""
+        if self.location_scheme == "file":
+            Path(self.data_records_root).mkdir(
+                parents=True,
+                exist_ok=True,
+            )
+
     @property
     def row_count(self) -> int:
         """Get row count from loaded dataset."""
@@ -163,7 +185,7 @@ def load(
         start_time = time.perf_counter()

         # reset paths from original location before load
-        _, self.paths = self.parse_location(self.location)
+        _, self.paths = self.parse_location(self.data_records_root)

         # perform initial load of full dataset
         self.dataset = self._load_pyarrow_dataset()
@@ -172,7 +194,7 @@ def load(
             self.dataset = self._get_filtered_dataset(**filters)

         logger.info(
-            f"Dataset successfully loaded: '{self.location}', "
+            f"Dataset successfully loaded: '{self.data_records_root}', "
             f"{round(time.perf_counter() - start_time, 2)}s"
         )

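Both load() and write() now resolve against data_records_root rather than the raw location. A quick illustration of that derivation (the location value is hypothetical):

location = "s3://timdex/dataset/"  # hypothetical; a trailing slash is stripped
assert f"{location.removesuffix('/')}/data/records" == "s3://timdex/dataset/data/records"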
@@ -298,6 +320,7 @@ def get_s3_filesystem() -> fs.FileSystem:
             session_token=credentials.token,
         )

+    # NOTE: WIP: this will be heavily reworked in upcoming .load() updates
     @classmethod
     def parse_location(
         cls,
@@ -315,6 +338,7 @@ def parse_location(
             case _:
                 raise TypeError("Location type must be str or list[str].")

+    # NOTE: WIP: these will be removed in upcoming .load() updates
     @classmethod
     def _parse_single_location(
         cls, location: str
@@ -328,6 +352,7 @@ def _parse_single_location(
             source = location
         return filesystem, source

+    # NOTE: WIP: these will be removed in upcoming .load() updates
     @classmethod
     def _parse_multiple_locations(
         cls, location: list[str]
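For readers without the full file: only the case _ fallback is visible in this diff, but given the two helpers it delegates to, parse_location presumably dispatches on the location type roughly as in this sketch (the str/list branches are inferred, not confirmed by the diff):

match location:
    case str():
        return cls._parse_single_location(location)
    case list():
        return cls._parse_multiple_locations(location)
    case _:
        raise TypeError("Location type must be str or list[str].")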
@@ -348,6 +373,7 @@ def write(
         records_iter: Iterator["DatasetRecord"],
         *,
         use_threads: bool = True,
+        write_append_deltas: bool = True,
     ) -> list[ds.WrittenFile]:
         """Write records to the TIMDEX parquet dataset.

@@ -370,25 +396,27 @@ def write(
         Args:
             - records_iter: Iterator of DatasetRecord instances
             - use_threads: boolean if threads should be used for writing
+            - write_append_deltas: boolean if append deltas should be written for records
+              written during this write
         """
         start_time = time.perf_counter()
-        self._written_files = []
+        written_files: list[ds.WrittenFile] = []

         dataset_filesystem, dataset_path = self.parse_location(self.data_records_root)
         if isinstance(dataset_path, list):
             raise TypeError(
                 "Dataset location must be the root of a single dataset for writing"
             )

+        # write ETL parquet records
         record_batches_iter = self.create_record_batches(records_iter)
-
         ds.write_dataset(
             record_batches_iter,
             base_dir=dataset_path,
             basename_template="%s-{i}.parquet" % (str(uuid.uuid4())),  # noqa: UP031
             existing_data_behavior="overwrite_or_ignore",
             filesystem=dataset_filesystem,
-            file_visitor=lambda written_file: self._written_files.append(written_file),  # type: ignore[arg-type]
+            file_visitor=lambda written_file: written_files.append(written_file),  # type: ignore[arg-type]
             format="parquet",
             max_open_files=500,
             max_rows_per_file=self.config.max_rows_per_file,
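Replacing the self._written_files instance attribute with a local written_files list closed over by file_visitor keeps write state out of the instance, so repeated or concurrent writes cannot clobber each other. A self-contained sketch of the same pyarrow pattern (table contents and path are hypothetical):

import pyarrow as pa
import pyarrow.dataset as ds

collected: list[ds.WrittenFile] = []
ds.write_dataset(
    pa.table({"id": [1, 2, 3]}),
    base_dir="/tmp/example_dataset",  # hypothetical path
    format="parquet",
    existing_data_behavior="overwrite_or_ignore",
    # pyarrow calls file_visitor once per written file; each WrittenFile
    # exposes .path, .size, and .metadata (parquet FileMetaData)
    file_visitor=lambda wf: collected.append(wf),
)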
@@ -399,8 +427,14 @@ def write(
             use_threads=use_threads,
         )

-        self.log_write_statistics(start_time)
-        return self._written_files  # type: ignore[return-value]
+        # write metadata append deltas
+        if write_append_deltas:
+            for written_file in written_files:
+                self.metadata.write_append_delta_duckdb(written_file.path)  # type: ignore[attr-defined]
+
+        self.log_write_statistics(start_time, written_files)
+
+        return written_files

     def create_record_batches(
         self, records_iter: Iterator["DatasetRecord"]
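A hypothetical caller's view of the updated signature, assuming the enclosing class is TIMDEXDataset and that records_iter yields DatasetRecord instances; write_append_deltas=False opts out of the new metadata deltas:

td = TIMDEXDataset("s3://timdex/dataset")  # hypothetical location
written = td.write(records_iter, write_append_deltas=False)
total_rows = sum(wf.metadata.num_rows for wf in written)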
@@ -423,19 +457,18 @@ def create_record_batches(
             logger.debug(f"Yielding batch {i + 1} for dataset writing.")
             yield batch

-    def log_write_statistics(self, start_time: float) -> None:
+    def log_write_statistics(
+        self,
+        start_time: float,
+        written_files: list[ds.WrittenFile],
+    ) -> None:
         """Parse written files from write and log statistics."""
         total_time = round(time.perf_counter() - start_time, 2)
-        total_files = len(self._written_files)
+        total_files = len(written_files)
         total_rows = sum(
-            [
-                wf.metadata.num_rows  # type: ignore[attr-defined]
-                for wf in self._written_files
-            ]
-        )
-        total_size = sum(
-            [wf.size for wf in self._written_files]  # type: ignore[attr-defined]
+            [wf.metadata.num_rows for wf in written_files]  # type: ignore[attr-defined]
         )
+        total_size = sum([wf.size for wf in written_files])  # type: ignore[attr-defined]
         logger.info(
             f"Dataset write complete - elapsed: "
             f"{total_time}s, "
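One payoff of passing written_files explicitly instead of reading self._written_files: log_write_statistics can now be exercised in isolation with simple stand-ins (the SimpleNamespace stubs below are hypothetical, not part of the library; td is the TIMDEXDataset instance from the earlier sketch):

import time
from types import SimpleNamespace

fake_files = [
    SimpleNamespace(size=1024, metadata=SimpleNamespace(num_rows=100)),
    SimpleNamespace(size=2048, metadata=SimpleNamespace(num_rows=250)),
]
td.log_write_statistics(time.perf_counter(), fake_files)  # type: ignore[arg-type]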