
Commit d4931d5

Write append deltas on ETL records data write
Why these changes are being introduced:

With the new metadata approach, an important component is "append deltas". These are standalone parquet files that contain metadata about the records added to the ETL records data parquet files. They are eventually merged into the main static metadata file, but until then are needed for metadata queries.

How this addresses that need:

During TIMDEXDataset.write(), after each ETL parquet file is written we lean on the new method TIMDEXDatasetMetadata.write_append_delta_duckdb() to read metadata from that file and write a new append delta parquet file. This is performed entirely in a DuckDB context, allowing for simple column selection.

Side effects of this change:
* During dataset write, append deltas will be created.

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-527
1 parent a1b28b4 commit d4931d5
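For orientation, a minimal sketch of the write-side flow this commit introduces. TIMDEXDataset.write() and its new write_append_deltas flag come from this diff; the dataset location is illustrative, and the assumption that generate_sample_records(n) yields DatasetRecord instances is inferred from its use in tests/test_write.py below:

```python
from tests.utils import generate_sample_records
from timdex_dataset_api.dataset import TIMDEXDataset

timdex_dataset = TIMDEXDataset(location="path/to/dataset")

# write ETL parquet files; write_append_deltas=True is the default, so one
# append delta parquet file is written per ETL parquet file
written_files = timdex_dataset.write(
    generate_sample_records(1_000),
    write_append_deltas=True,
)
print(len(written_files))
```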

7 files changed

Lines changed: 295 additions & 92 deletions


README.md

Lines changed: 10 additions & 1 deletion
@@ -49,7 +49,16 @@ WARNING_ONLY_LOGGERS=# Comma-seperated list of logger names to set as WARNING on
 MINIO_S3_ENDPOINT_URL=# If set, informs the library to use this Minio S3 instance. Requires the http(s):// protocol.
 MINIO_USERNAME=# Username / AWS Key for Minio; required when MINIO_S3_ENDPOINT_URL is set
 MINIO_PASSWORD=# Pasword / AWS Secret for Minio; required when MINIO_S3_ENDPOINT_URL is set
-MINIO_DATA=# Path to persist MinIO data if started via Makefile command
+MINIO_DATA=# Path to persist MinIO data if started via Makefile command
+
+TDA_READ_BATCH_SIZE=# Row size of batches read, affecting memory consumption
+TDA_WRITE_BATCH_SIZE=# Row size of batches written, directly affecting row group size in final parquet files
+TDA_MAX_ROWS_PER_GROUP=# Max number of rows per row group in a parquet file
+TDA_MAX_ROWS_PER_FILE=# Max number of rows in a single parquet file
+TDA_BATCH_READ_AHEAD=# Number of batches to optimistically read ahead when batch reading from a dataset; pyarrow default is 16
+TDA_FRAGMENT_READ_AHEAD=# Number of fragments to optimistically read ahead when batch reading from a dataset; pyarrow default is 4
+TDA_DUCKDB_MEMORY_LIMIT=# Memory limit for DuckDB connection
+TDA_DUCKDB_THREADS=# Thread limit for DuckDB connection
 ```
 
 ## Local S3 via MinIO
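Of the variables above, the max-rows settings surface as attributes on TIMDEXDatasetConfig, as exercised in tests/test_write.py below. A minimal sketch of overriding them in code rather than via environment variables; the env-var wiring itself lives in the library and is not shown in this diff:

```python
from timdex_dataset_api.dataset import TIMDEXDataset

timdex_dataset = TIMDEXDataset(location="path/to/dataset")

# smaller values yield more, smaller parquet files per write --
# and therefore more append delta files, one per ETL parquet file
timdex_dataset.config.max_rows_per_file = 100_000
timdex_dataset.config.max_rows_per_group = 1_000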

tests/test_dataset.py

Lines changed: 32 additions & 7 deletions
@@ -1,5 +1,6 @@
 # ruff: noqa: D205, D209, SLF001, PLR2004
 
+import glob
 import os
 from datetime import date
 from unittest.mock import MagicMock, patch
@@ -18,11 +19,24 @@
 @pytest.mark.parametrize(
     ("location", "expected_file_system", "expected_source"),
     [
-        ("path/to/dataset", fs.LocalFileSystem, "path/to/dataset"),
-        ("s3://bucket/path/to/dataset", fs.S3FileSystem, "bucket/path/to/dataset"),
+        (
+            "path/to/dataset",
+            fs.LocalFileSystem,
+            "path/to/dataset/data/records",
+        ),
+        (
+            "s3://timdex/path/to/dataset",
+            fs.S3FileSystem,
+            "timdex/path/to/dataset/data/records",
+        ),
     ],
 )
-def test_dataset_init_success(location, expected_file_system, expected_source):
+def test_dataset_init_success(
+    location,
+    expected_file_system,
+    expected_source,
+    mocked_timdex_bucket,
+):
     timdex_dataset = TIMDEXDataset(location=location)
     assert isinstance(timdex_dataset.filesystem, expected_file_system)
     assert timdex_dataset.paths == expected_source
@@ -58,7 +72,7 @@ def test_dataset_load_local_sets_filesystem_and_dataset_success(
     result = timdex_dataset.load()
 
     mock_pyarrow_ds.assert_called_once_with(
-        "local/path/to/dataset",
+        "local/path/to/dataset/data/records",
         schema=timdex_dataset.schema,
         format="parquet",
         partitioning="hive",
@@ -72,16 +86,16 @@
 @patch("timdex_dataset_api.dataset.TIMDEXDataset.get_s3_filesystem")
 @patch("timdex_dataset_api.dataset.ds.dataset")
 def test_dataset_load_s3_sets_filesystem_and_dataset_success(
-    mock_pyarrow_ds, mock_get_s3_fs
+    mock_pyarrow_ds, mock_get_s3_fs, mocked_timdex_bucket
 ):
     mock_get_s3_fs.return_value = MagicMock()
     mock_pyarrow_ds.return_value = MagicMock()
 
-    timdex_dataset = TIMDEXDataset(location="s3://bucket/path/to/dataset")
+    timdex_dataset = TIMDEXDataset(location="s3://timdex/path/to/dataset")
     result = timdex_dataset.load()
 
     mock_pyarrow_ds.assert_called_with(
-        "bucket/path/to/dataset",
+        "timdex/path/to/dataset/data/records",
         schema=timdex_dataset.schema,
         format="parquet",
         partitioning="hive",
@@ -497,3 +511,14 @@ def test_dataset_load_current_records_gets_correct_same_day_daily_runs_ordering(
 
     assert first_record["run_id"] == "run-5"
     assert first_record["action"] == "delete"
+
+
+def test_dataset_records_data_structure_is_idempotent(dataset_with_runs):
+    assert os.path.exists(dataset_with_runs.data_records_root)
+    start_file_count = glob.glob(f"{dataset_with_runs.data_records_root}/**/*")
+
+    dataset_with_runs.create_data_structure()
+
+    assert os.path.exists(dataset_with_runs.data_records_root)
+    end_file_count = glob.glob(f"{dataset_with_runs.data_records_root}/**/*")
+    assert start_file_count == end_file_count

tests/test_metadata.py

Lines changed: 18 additions & 4 deletions
@@ -1,17 +1,20 @@
+import glob
+import os
+from pathlib import Path
+
 from duckdb import DuckDBPyConnection
 
 from timdex_dataset_api import TIMDEXDatasetMetadata
 
 
 def test_tdm_init_no_metadata_file_warning_success(caplog, dataset_with_runs_location):
-    tdm = TIMDEXDatasetMetadata(dataset_with_runs_location)
+    TIMDEXDatasetMetadata(dataset_with_runs_location)
 
-    assert tdm.conn is None
     assert "Static metadata database not found" in caplog.text
 
 
-def test_tdm_local_dataset_structure_properties():
-    local_root = "/path/to/nothing"
+def test_tdm_local_dataset_structure_properties(tmp_path):
+    local_root = str(Path(tmp_path) / "path/to/nothing")
     tdm_local = TIMDEXDatasetMetadata(local_root)
     assert tdm_local.location == local_root
     assert tdm_local.location_scheme == "file"
@@ -44,3 +47,14 @@ def test_tdm_connection_static_database_records_table_exists(timdex_dataset_meta
         """select * from static_db.records;"""
     ).to_df()
     assert len(records_df) > 0
+
+
+def test_dataset_metadata_structure_is_idempotent(timdex_dataset_metadata):
+    assert os.path.exists(timdex_dataset_metadata.metadata_root)
+    start_file_count = glob.glob(f"{timdex_dataset_metadata.metadata_root}/**/*")
+
+    timdex_dataset_metadata.create_metadata_structure()
+
+    assert os.path.exists(timdex_dataset_metadata.metadata_root)
+    end_file_count = glob.glob(f"{timdex_dataset_metadata.metadata_root}/**/*")
+    assert start_file_count == end_file_count

tests/test_write.py

Lines changed: 38 additions & 0 deletions
@@ -1,16 +1,19 @@
 # ruff: noqa: PLR2004, D209, D205
 import math
 import os
+from pathlib import Path
 from unittest.mock import patch
 
 import pyarrow.dataset as ds
+import pyarrow.parquet as pq
 import pytest
 
 from tests.utils import generate_sample_records
 from timdex_dataset_api.dataset import (
     TIMDEX_DATASET_SCHEMA,
     TIMDEXDataset,
 )
+from timdex_dataset_api.metadata import ORDERED_METADATA_COLUMN_NAMES
 
 
 def test_dataset_write_records_to_new_local_dataset(
@@ -144,3 +147,38 @@ def test_dataset_write_partition_overwrite_files_with_same_name(
     # assert that only the second file exists and overwriting occurs
     assert os.path.exists(written_files_source_a1[0].path)
     assert new_local_dataset.row_count == 7
+
+
+def test_dataset_write_single_append_delta_success(
+    new_local_dataset, sample_records_iter
+):
+    written_files = new_local_dataset.write(sample_records_iter(1_000))
+    append_deltas = os.listdir(new_local_dataset.metadata.append_deltas_path)
+
+    assert len(append_deltas) == len(written_files)
+
+
+def test_dataset_write_multiple_append_deltas_success(
+    new_local_dataset, sample_records_iter
+):
+    """Expecting 10 ETL parquet files written, and so 10 append deltas as well."""
+    new_local_dataset.config.max_rows_per_file = 100
+    new_local_dataset.config.max_rows_per_group = 100
+
+    written_files = new_local_dataset.write(sample_records_iter(1_000))
+    append_deltas = os.listdir(new_local_dataset.metadata.append_deltas_path)
+
+    assert len(written_files) == 10
+    assert len(append_deltas) == len(written_files)
+
+
+def test_dataset_write_append_delta_expected_metadata_columns(
+    new_local_dataset, sample_records_iter
+):
+    new_local_dataset.write(sample_records_iter(1_000))
+    append_delta_filepath = os.listdir(new_local_dataset.metadata.append_deltas_path)[0]
+
+    append_delta = pq.ParquetFile(
+        new_local_dataset.metadata.append_deltas_path / Path(append_delta_filepath)
+    )
+    assert append_delta.schema.names == ORDERED_METADATA_COLUMN_NAMES

timdex_dataset_api/dataset.py

Lines changed: 53 additions & 20 deletions
@@ -10,7 +10,9 @@
 from dataclasses import dataclass, field
 from datetime import UTC, date, datetime
 from functools import reduce
-from typing import TYPE_CHECKING, TypedDict, Unpack
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal, TypedDict, Unpack
+from urllib.parse import urlparse
 
 import boto3
 import pandas as pd
@@ -20,6 +22,7 @@
 
 from timdex_dataset_api.config import configure_logger
 from timdex_dataset_api.exceptions import DatasetNotLoadedError
+from timdex_dataset_api.metadata import TIMDEXDatasetMetadata
 
 if TYPE_CHECKING:
     from timdex_dataset_api.record import DatasetRecord  # pragma: nocover
@@ -117,19 +120,38 @@ def __init__(
         self.config = config or TIMDEXDatasetConfig()
         self.location = location
 
+        self.create_data_structure()
+
         # pyarrow dataset
-        self.filesystem, self.paths = self.parse_location(self.location)
+        self.filesystem, self.paths = self.parse_location(self.data_records_root)
         self.dataset: ds.Dataset = None  # type: ignore[assignment]
         self.schema = TIMDEX_DATASET_SCHEMA
         self.partition_columns = TIMDEX_DATASET_PARTITION_COLUMNS
 
-        # writing
-        self._written_files: list[ds.WrittenFile] = None  # type: ignore[assignment]
+        # dataset metadata
+        self.metadata = TIMDEXDatasetMetadata(location)  # type: ignore[arg-type]
+
+    @property
+    def location_scheme(self) -> Literal["file", "s3"]:
+        scheme = urlparse(self.location).scheme  # type: ignore[arg-type]
+        if scheme == "":
+            return "file"
+        if scheme == "s3":
+            return "s3"
+        raise ValueError(f"Location with scheme type '{scheme}' not supported.")
 
     @property
     def data_records_root(self) -> str:
         return f"{self.location.removesuffix('/')}/data/records"  # type: ignore[union-attr]
 
+    def create_data_structure(self) -> None:
+        """Ensure ETL records data structure exists in TIMDEX dataset."""
+        if self.location_scheme == "file":
+            Path(self.data_records_root).mkdir(
+                parents=True,
+                exist_ok=True,
+            )
+
     @property
     def row_count(self) -> int:
         """Get row count from loaded dataset."""
@@ -163,7 +185,7 @@ def load(
         start_time = time.perf_counter()
 
         # reset paths from original location before load
-        _, self.paths = self.parse_location(self.location)
+        _, self.paths = self.parse_location(self.data_records_root)
 
         # perform initial load of full dataset
         self.dataset = self._load_pyarrow_dataset()
@@ -172,7 +194,7 @@
             self.dataset = self._get_filtered_dataset(**filters)
 
         logger.info(
-            f"Dataset successfully loaded: '{self.location}', "
+            f"Dataset successfully loaded: '{self.data_records_root}', "
             f"{round(time.perf_counter()-start_time, 2)}s"
         )
 
@@ -298,6 +320,7 @@ def get_s3_filesystem() -> fs.FileSystem:
             session_token=credentials.token,
         )
 
+    # NOTE: WIP: this will be heavily reworked in upcoming .load() updates
     @classmethod
     def parse_location(
         cls,
@@ -315,6 +338,7 @@ def parse_location(
             case _:
                 raise TypeError("Location type must be str or list[str].")
 
+    # NOTE: WIP: these will be removed in upcoming .load() updates
     @classmethod
     def _parse_single_location(
         cls, location: str
@@ -328,6 +352,7 @@ def _parse_single_location(
             source = location
         return filesystem, source
 
+    # NOTE: WIP: these will be removed in upcoming .load() updates
     @classmethod
     def _parse_multiple_locations(
         cls, location: list[str]
@@ -348,6 +373,7 @@ def write(
         records_iter: Iterator["DatasetRecord"],
         *,
         use_threads: bool = True,
+        write_append_deltas: bool = True,
     ) -> list[ds.WrittenFile]:
         """Write records to the TIMDEX parquet dataset.
 
@@ -370,25 +396,27 @@
         Args:
             - records_iter: Iterator of DatasetRecord instances
             - use_threads: boolean if threads should be used for writing
+            - write_append_deltas: boolean if append deltas should be written for records
+                written during write
         """
         start_time = time.perf_counter()
-        self._written_files = []
+        written_files: list[ds.WrittenFile] = []
 
         dataset_filesystem, dataset_path = self.parse_location(self.data_records_root)
         if isinstance(dataset_path, list):
             raise TypeError(
                 "Dataset location must be the root of a single dataset for writing"
             )
 
+        # write ETL parquet records
        record_batches_iter = self.create_record_batches(records_iter)
-
         ds.write_dataset(
             record_batches_iter,
             base_dir=dataset_path,
             basename_template="%s-{i}.parquet" % (str(uuid.uuid4())),  # noqa: UP031
             existing_data_behavior="overwrite_or_ignore",
             filesystem=dataset_filesystem,
-            file_visitor=lambda written_file: self._written_files.append(written_file),  # type: ignore[arg-type]
+            file_visitor=lambda written_file: written_files.append(written_file),  # type: ignore[arg-type]
             format="parquet",
             max_open_files=500,
             max_rows_per_file=self.config.max_rows_per_file,
@@ -399,8 +427,14 @@ def write(
             use_threads=use_threads,
         )
 
-        self.log_write_statistics(start_time)
-        return self._written_files  # type: ignore[return-value]
+        # write metadata append deltas
+        if write_append_deltas:
+            for written_file in written_files:
+                self.metadata.write_append_delta_duckdb(written_file.path)  # type: ignore[attr-defined]
+
+        self.log_write_statistics(start_time, written_files)
+
+        return written_files
 
     def create_record_batches(
@@ -423,19 +457,18 @@
             logger.debug(f"Yielding batch {i + 1} for dataset writing.")
             yield batch
 
-    def log_write_statistics(self, start_time: float) -> None:
+    def log_write_statistics(
+        self,
+        start_time: float,
+        written_files: list[ds.WrittenFile],
+    ) -> None:
         """Parse written files from write and log statistics."""
         total_time = round(time.perf_counter() - start_time, 2)
-        total_files = len(self._written_files)
+        total_files = len(written_files)
         total_rows = sum(
-            [
-                wf.metadata.num_rows  # type: ignore[attr-defined]
-                for wf in self._written_files
-            ]
-        )
-        total_size = sum(
-            [wf.size for wf in self._written_files]  # type: ignore[attr-defined]
+            [wf.metadata.num_rows for wf in written_files]  # type: ignore[attr-defined]
         )
+        total_size = sum([wf.size for wf in written_files])  # type: ignore[attr-defined]
         logger.info(
             f"Dataset write complete - elapsed: "
             f"{total_time}s, "
