
Commit 184cddc

Merge pull request #158 from MITLibraries/TIMX-527-write-append-deltas

TIMX 527 - write append deltas

2 parents: 3efe8b7 + d4931d5

8 files changed: 416 additions & 233 deletions


Pipfile.lock

Lines changed: 121 additions & 141 deletions

Some generated files are not rendered by default.

README.md

Lines changed: 10 additions & 1 deletion

````diff
@@ -49,7 +49,16 @@ WARNING_ONLY_LOGGERS=# Comma-separated list of logger names to set as WARNING only
 MINIO_S3_ENDPOINT_URL=# If set, informs the library to use this Minio S3 instance. Requires the http(s):// protocol.
 MINIO_USERNAME=# Username / AWS Key for Minio; required when MINIO_S3_ENDPOINT_URL is set
 MINIO_PASSWORD=# Password / AWS Secret for Minio; required when MINIO_S3_ENDPOINT_URL is set
-MINIO_DATA=# Path to persist MinIO data if started via Makefile command
+MINIO_DATA=# Path to persist MinIO data if started via Makefile command
+
+TDA_READ_BATCH_SIZE=# Row size of batches read, affecting memory consumption
+TDA_WRITE_BATCH_SIZE=# Row size of batches written, directly affecting row group size in final parquet files
+TDA_MAX_ROWS_PER_GROUP=# Max number of rows per row group in a parquet file
+TDA_MAX_ROWS_PER_FILE=# Max number of rows in a single parquet file
+TDA_BATCH_READ_AHEAD=# Number of batches to optimistically read ahead when batch reading from a dataset; pyarrow default is 16
+TDA_FRAGMENT_READ_AHEAD=# Number of fragments to optimistically read ahead when batch reading from a dataset; pyarrow default is 4
+TDA_DUCKDB_MEMORY_LIMIT=# Memory limit for DuckDB connection
+TDA_DUCKDB_THREADS=# Thread limit for DuckDB connection
 ```
 
 ## Local S3 via MinIO
````

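These `TDA_*` variables evidently feed `TIMDEXDatasetConfig`: the constructor in this commit falls back to `config or TIMDEXDatasetConfig()`, and the new write tests tune `config.max_rows_per_file` and `config.max_rows_per_group` directly. A minimal sketch of how env-var-driven defaults like these are typically wired, assuming hypothetical field names and defaults for everything not visible in this diff:

```python
# Sketch only: max_rows_per_file / max_rows_per_group appear in this commit;
# the other field names and all default values here are assumptions.
import os
from dataclasses import dataclass, field


def _env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment, falling back to a default."""
    return int(os.environ.get(name, default))


@dataclass
class TIMDEXDatasetConfig:
    read_batch_size: int = field(
        default_factory=lambda: _env_int("TDA_READ_BATCH_SIZE", 1_000)
    )
    write_batch_size: int = field(
        default_factory=lambda: _env_int("TDA_WRITE_BATCH_SIZE", 1_000)
    )
    max_rows_per_group: int = field(
        default_factory=lambda: _env_int("TDA_MAX_ROWS_PER_GROUP", 1_000)
    )
    max_rows_per_file: int = field(
        default_factory=lambda: _env_int("TDA_MAX_ROWS_PER_FILE", 100_000)
    )
    # TDA_BATCH_READ_AHEAD, TDA_FRAGMENT_READ_AHEAD, TDA_DUCKDB_MEMORY_LIMIT,
    # and TDA_DUCKDB_THREADS would follow the same pattern.
```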
tests/test_dataset.py

Lines changed: 32 additions & 7 deletions

```diff
@@ -1,5 +1,6 @@
 # ruff: noqa: D205, D209, SLF001, PLR2004
 
+import glob
 import os
 from datetime import date
 from unittest.mock import MagicMock, patch
@@ -18,11 +19,24 @@
 @pytest.mark.parametrize(
     ("location", "expected_file_system", "expected_source"),
     [
-        ("path/to/dataset", fs.LocalFileSystem, "path/to/dataset"),
-        ("s3://bucket/path/to/dataset", fs.S3FileSystem, "bucket/path/to/dataset"),
+        (
+            "path/to/dataset",
+            fs.LocalFileSystem,
+            "path/to/dataset/data/records",
+        ),
+        (
+            "s3://timdex/path/to/dataset",
+            fs.S3FileSystem,
+            "timdex/path/to/dataset/data/records",
+        ),
     ],
 )
-def test_dataset_init_success(location, expected_file_system, expected_source):
+def test_dataset_init_success(
+    location,
+    expected_file_system,
+    expected_source,
+    mocked_timdex_bucket,
+):
     timdex_dataset = TIMDEXDataset(location=location)
     assert isinstance(timdex_dataset.filesystem, expected_file_system)
     assert timdex_dataset.paths == expected_source
@@ -58,7 +72,7 @@ def test_dataset_load_local_sets_filesystem_and_dataset_success(
     result = timdex_dataset.load()
 
     mock_pyarrow_ds.assert_called_once_with(
-        "local/path/to/dataset",
+        "local/path/to/dataset/data/records",
         schema=timdex_dataset.schema,
         format="parquet",
         partitioning="hive",
@@ -72,16 +86,16 @@
 @patch("timdex_dataset_api.dataset.TIMDEXDataset.get_s3_filesystem")
 @patch("timdex_dataset_api.dataset.ds.dataset")
 def test_dataset_load_s3_sets_filesystem_and_dataset_success(
-    mock_pyarrow_ds, mock_get_s3_fs
+    mock_pyarrow_ds, mock_get_s3_fs, mocked_timdex_bucket
 ):
     mock_get_s3_fs.return_value = MagicMock()
     mock_pyarrow_ds.return_value = MagicMock()
 
-    timdex_dataset = TIMDEXDataset(location="s3://bucket/path/to/dataset")
+    timdex_dataset = TIMDEXDataset(location="s3://timdex/path/to/dataset")
     result = timdex_dataset.load()
 
     mock_pyarrow_ds.assert_called_with(
-        "bucket/path/to/dataset",
+        "timdex/path/to/dataset/data/records",
         schema=timdex_dataset.schema,
         format="parquet",
         partitioning="hive",
@@ -497,3 +511,14 @@ def test_dataset_load_current_records_gets_correct_same_day_daily_runs_ordering(
 
     assert first_record["run_id"] == "run-5"
     assert first_record["action"] == "delete"
+
+
+def test_dataset_records_data_structure_is_idempotent(dataset_with_runs):
+    assert os.path.exists(dataset_with_runs.data_records_root)
+    start_file_count = glob.glob(f"{dataset_with_runs.data_records_root}/**/*")
+
+    dataset_with_runs.create_data_structure()
+
+    assert os.path.exists(dataset_with_runs.data_records_root)
+    end_file_count = glob.glob(f"{dataset_with_runs.data_records_root}/**/*")
+    assert start_file_count == end_file_count
```

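The reworked parametrization above pins down the new path convention: the pyarrow source is no longer the bare `location` but `location` plus `data/records`. A small local-only sketch of that behavior, using only names visible in this commit (note that instantiation now also creates the `data/records` tree on disk for local locations):

```python
# Path convention asserted by the tests above; instantiating a local dataset
# creates <location>/data/records as a side effect (create_data_structure).
from timdex_dataset_api.dataset import TIMDEXDataset

td = TIMDEXDataset(location="path/to/dataset")
assert td.data_records_root == "path/to/dataset/data/records"
assert td.paths == "path/to/dataset/data/records"
```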
tests/test_metadata.py

Lines changed: 18 additions & 4 deletions

```diff
@@ -1,17 +1,20 @@
+import glob
+import os
+from pathlib import Path
+
 from duckdb import DuckDBPyConnection
 
 from timdex_dataset_api import TIMDEXDatasetMetadata
 
 
 def test_tdm_init_no_metadata_file_warning_success(caplog, dataset_with_runs_location):
-    tdm = TIMDEXDatasetMetadata(dataset_with_runs_location)
+    TIMDEXDatasetMetadata(dataset_with_runs_location)
 
-    assert tdm.conn is None
     assert "Static metadata database not found" in caplog.text
 
 
-def test_tdm_local_dataset_structure_properties():
-    local_root = "/path/to/nothing"
+def test_tdm_local_dataset_structure_properties(tmp_path):
+    local_root = str(Path(tmp_path) / "path/to/nothing")
     tdm_local = TIMDEXDatasetMetadata(local_root)
     assert tdm_local.location == local_root
     assert tdm_local.location_scheme == "file"
@@ -44,3 +47,14 @@ def test_tdm_connection_static_database_records_table_exists(timdex_dataset_meta
         """select * from static_db.records;"""
     ).to_df()
     assert len(records_df) > 0
+
+
+def test_dataset_metadata_structure_is_idempotent(timdex_dataset_metadata):
+    assert os.path.exists(timdex_dataset_metadata.metadata_root)
+    start_file_count = glob.glob(f"{timdex_dataset_metadata.metadata_root}/**/*")
+
+    timdex_dataset_metadata.create_metadata_structure()
+
+    assert os.path.exists(timdex_dataset_metadata.metadata_root)
+    end_file_count = glob.glob(f"{timdex_dataset_metadata.metadata_root}/**/*")
+    assert start_file_count == end_file_count
```

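Both new idempotency tests (here and in `tests/test_dataset.py`) hinge on structure creation being safe to repeat. On the records side this commit implements that with `pathlib`'s `mkdir(parents=True, exist_ok=True)`; a minimal sketch, assuming `create_metadata_structure` mirrors the same approach:

```python
# Why repeated calls are no-ops: mkdir with exist_ok=True leaves existing
# directories (and their contents) untouched. That the metadata side mirrors
# create_data_structure is an assumption here, not shown in this diff.
from pathlib import Path

def create_metadata_structure(metadata_root: str) -> None:
    Path(metadata_root).mkdir(parents=True, exist_ok=True)

create_metadata_structure("/tmp/timdex/metadata")
create_metadata_structure("/tmp/timdex/metadata")  # second call changes nothing
```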
tests/test_write.py

Lines changed: 38 additions & 0 deletions

```diff
@@ -1,16 +1,19 @@
 # ruff: noqa: PLR2004, D209, D205
 import math
 import os
+from pathlib import Path
 from unittest.mock import patch
 
 import pyarrow.dataset as ds
+import pyarrow.parquet as pq
 import pytest
 
 from tests.utils import generate_sample_records
 from timdex_dataset_api.dataset import (
     TIMDEX_DATASET_SCHEMA,
     TIMDEXDataset,
 )
+from timdex_dataset_api.metadata import ORDERED_METADATA_COLUMN_NAMES
 
 
 def test_dataset_write_records_to_new_local_dataset(
@@ -144,3 +147,38 @@ def test_dataset_write_partition_overwrite_files_with_same_name(
     # assert that only the second file exists and overwriting occurs
     assert os.path.exists(written_files_source_a1[0].path)
     assert new_local_dataset.row_count == 7
+
+
+def test_dataset_write_single_append_delta_success(
+    new_local_dataset, sample_records_iter
+):
+    written_files = new_local_dataset.write(sample_records_iter(1_000))
+    append_deltas = os.listdir(new_local_dataset.metadata.append_deltas_path)
+
+    assert len(append_deltas) == len(written_files)
+
+
+def test_dataset_write_multiple_append_deltas_success(
+    new_local_dataset, sample_records_iter
+):
+    """Expecting 10 ETL parquet files written, and so 10 append deltas as well."""
+    new_local_dataset.config.max_rows_per_file = 100
+    new_local_dataset.config.max_rows_per_group = 100
+
+    written_files = new_local_dataset.write(sample_records_iter(1_000))
+    append_deltas = os.listdir(new_local_dataset.metadata.append_deltas_path)
+
+    assert len(written_files) == 10
+    assert len(append_deltas) == len(written_files)
+
+
+def test_dataset_write_append_delta_expected_metadata_columns(
+    new_local_dataset, sample_records_iter
+):
+    new_local_dataset.write(sample_records_iter(1_000))
+    append_delta_filepath = os.listdir(new_local_dataset.metadata.append_deltas_path)[0]
+
+    append_delta = pq.ParquetFile(
+        new_local_dataset.metadata.append_deltas_path / Path(append_delta_filepath)
+    )
+    assert append_delta.schema.names == ORDERED_METADATA_COLUMN_NAMES
```

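The caller-facing change these tests pin down is the new `write_append_deltas` keyword on `TIMDEXDataset.write` (default `True`), which produces one metadata append delta per written ETL parquet file. A hedged usage sketch; `timdex_dataset` and `records_iter` are stand-ins for a real dataset instance and a real `DatasetRecord` iterator:

```python
# Opting out of append deltas; with the default write_append_deltas=True,
# one delta per parquet file is written via write_append_delta_duckdb.
written_files = timdex_dataset.write(records_iter, write_append_deltas=False)
print(f"wrote {len(written_files)} parquet files, no append deltas")
```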
timdex_dataset_api/dataset.py

Lines changed: 53 additions & 20 deletions

```diff
@@ -10,7 +10,9 @@
 from dataclasses import dataclass, field
 from datetime import UTC, date, datetime
 from functools import reduce
-from typing import TYPE_CHECKING, TypedDict, Unpack
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal, TypedDict, Unpack
+from urllib.parse import urlparse
 
 import boto3
 import pandas as pd
@@ -20,6 +22,7 @@
 
 from timdex_dataset_api.config import configure_logger
 from timdex_dataset_api.exceptions import DatasetNotLoadedError
+from timdex_dataset_api.metadata import TIMDEXDatasetMetadata
 
 if TYPE_CHECKING:
     from timdex_dataset_api.record import DatasetRecord  # pragma: nocover
@@ -117,19 +120,38 @@ def __init__(
         self.config = config or TIMDEXDatasetConfig()
         self.location = location
 
+        self.create_data_structure()
+
         # pyarrow dataset
-        self.filesystem, self.paths = self.parse_location(self.location)
+        self.filesystem, self.paths = self.parse_location(self.data_records_root)
         self.dataset: ds.Dataset = None  # type: ignore[assignment]
         self.schema = TIMDEX_DATASET_SCHEMA
         self.partition_columns = TIMDEX_DATASET_PARTITION_COLUMNS
 
-        # writing
-        self._written_files: list[ds.WrittenFile] = None  # type: ignore[assignment]
+        # dataset metadata
+        self.metadata = TIMDEXDatasetMetadata(location)  # type: ignore[arg-type]
+
+    @property
+    def location_scheme(self) -> Literal["file", "s3"]:
+        scheme = urlparse(self.location).scheme  # type: ignore[arg-type]
+        if scheme == "":
+            return "file"
+        if scheme == "s3":
+            return "s3"
+        raise ValueError(f"Location with scheme type '{scheme}' not supported.")
 
     @property
     def data_records_root(self) -> str:
         return f"{self.location.removesuffix('/')}/data/records"  # type: ignore[union-attr]
 
+    def create_data_structure(self) -> None:
+        """Ensure ETL records data structure exists in TIMDEX dataset."""
+        if self.location_scheme == "file":
+            Path(self.data_records_root).mkdir(
+                parents=True,
+                exist_ok=True,
+            )
+
     @property
     def row_count(self) -> int:
         """Get row count from loaded dataset."""
@@ -163,7 +185,7 @@ def load(
         start_time = time.perf_counter()
 
         # reset paths from original location before load
-        _, self.paths = self.parse_location(self.location)
+        _, self.paths = self.parse_location(self.data_records_root)
 
         # perform initial load of full dataset
         self.dataset = self._load_pyarrow_dataset()
@@ -172,7 +194,7 @@
             self.dataset = self._get_filtered_dataset(**filters)
 
         logger.info(
-            f"Dataset successfully loaded: '{self.location}', "
+            f"Dataset successfully loaded: '{self.data_records_root}', "
            f"{round(time.perf_counter()-start_time, 2)}s"
         )
 
@@ -298,6 +320,7 @@ def get_s3_filesystem() -> fs.FileSystem:
             session_token=credentials.token,
         )
 
+    # NOTE: WIP: this will be heavily reworked in upcoming .load() updates
     @classmethod
     def parse_location(
         cls,
@@ -315,6 +338,7 @@ def parse_location(
             case _:
                 raise TypeError("Location type must be str or list[str].")
 
+    # NOTE: WIP: these will be removed in upcoming .load() updates
     @classmethod
     def _parse_single_location(
         cls, location: str
@@ -328,6 +352,7 @@ def _parse_single_location(
             source = location
         return filesystem, source
 
+    # NOTE: WIP: these will be removed in upcoming .load() updates
     @classmethod
     def _parse_multiple_locations(
         cls, location: list[str]
@@ -348,6 +373,7 @@ def write(
         records_iter: Iterator["DatasetRecord"],
         *,
         use_threads: bool = True,
+        write_append_deltas: bool = True,
     ) -> list[ds.WrittenFile]:
         """Write records to the TIMDEX parquet dataset.
 
@@ -370,25 +396,27 @@
         Args:
             - records_iter: Iterator of DatasetRecord instances
             - use_threads: boolean if threads should be used for writing
+            - write_append_deltas: boolean if append deltas should be written for records
+            written during write
         """
         start_time = time.perf_counter()
-        self._written_files = []
+        written_files: list[ds.WrittenFile] = []
 
         dataset_filesystem, dataset_path = self.parse_location(self.data_records_root)
         if isinstance(dataset_path, list):
             raise TypeError(
                 "Dataset location must be the root of a single dataset for writing"
             )
 
+        # write ETL parquet records
         record_batches_iter = self.create_record_batches(records_iter)
-
         ds.write_dataset(
             record_batches_iter,
             base_dir=dataset_path,
             basename_template="%s-{i}.parquet" % (str(uuid.uuid4())),  # noqa: UP031
             existing_data_behavior="overwrite_or_ignore",
             filesystem=dataset_filesystem,
-            file_visitor=lambda written_file: self._written_files.append(written_file),  # type: ignore[arg-type]
+            file_visitor=lambda written_file: written_files.append(written_file),  # type: ignore[arg-type]
             format="parquet",
             max_open_files=500,
             max_rows_per_file=self.config.max_rows_per_file,
@@ -399,8 +427,14 @@
             use_threads=use_threads,
         )
 
-        self.log_write_statistics(start_time)
-        return self._written_files  # type: ignore[return-value]
+        # write metadata append deltas
+        if write_append_deltas:
+            for written_file in written_files:
+                self.metadata.write_append_delta_duckdb(written_file.path)  # type: ignore[attr-defined]
+
+        self.log_write_statistics(start_time, written_files)
+
+        return written_files
 
     def create_record_batches(
         self, records_iter: Iterator["DatasetRecord"]
@@ -423,19 +457,18 @@ def create_record_batches(
             logger.debug(f"Yielding batch {i + 1} for dataset writing.")
             yield batch
 
-    def log_write_statistics(self, start_time: float) -> None:
+    def log_write_statistics(
+        self,
+        start_time: float,
+        written_files: list[ds.WrittenFile],
+    ) -> None:
         """Parse written files from write and log statistics."""
         total_time = round(time.perf_counter() - start_time, 2)
-        total_files = len(self._written_files)
+        total_files = len(written_files)
         total_rows = sum(
-            [
-                wf.metadata.num_rows  # type: ignore[attr-defined]
-                for wf in self._written_files
-            ]
-        )
-        total_size = sum(
-            [wf.size for wf in self._written_files]  # type: ignore[attr-defined]
+            [wf.metadata.num_rows for wf in written_files]  # type: ignore[attr-defined]
         )
+        total_size = sum([wf.size for wf in written_files])  # type: ignore[attr-defined]
         logger.info(
             f"Dataset write complete - elapsed: "
             f"{total_time}s, "
```

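Taken together, the write path now reads: write the ETL parquet files, then (by default) record one metadata append delta per file. A rough end-to-end sketch under this commit's names; `records()` is a hypothetical stand-in, since `DatasetRecord` construction isn't shown in this diff:

```python
# Sketch of the new write flow; records() is hypothetical.
from timdex_dataset_api.dataset import TIMDEXDataset

td = TIMDEXDataset(location="/tmp/timdex-dataset")  # local: data/records created on init
written_files = td.write(records(), write_append_deltas=True)  # the default
# one append delta per parquet file, via
# td.metadata.write_append_delta_duckdb(written_file.path)
print(f"wrote {len(written_files)} parquet files and as many append deltas")
```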