Commit 76347b1
Rework dataset partitions to only year, month, day
Why these changes are being introduced:

* These changes simplify the partitioning schema for the TIMDEXDataset, allowing the app to take advantage of PyArrow's memory-efficient processes for reading and writing Parquet datasets. Furthermore, the new partitioning schema will result in a more efficient, coherent folder structure when writing datasets. For more details, see: https://mitlibraries.atlassian.net/wiki/spaces/IN/pages/4094296066/Engineering+Plan+Parquet+Datasets+for+TIMDEX+ETL#Rework-Dataset-Partitions-to-use-only-Year-%2F-Month-%2F-Day.

How this addresses that need:

* Update TIMDEX_DATASET_SCHEMA to include [year, month, day]
* Update DatasetRecord attrs to include [year, month, day] and set [source, run_date, run_type, run_id, action] as primary columns
* Add post_init method to DatasetRecord to derive partition values from 'run_date'
* Remove 'partition_values' from DatasetRecord.to_dict
* Remove 'partition_values' mixin from TIMDEXDataset.write to reduce complexity and have the write method utilize DatasetRecord partition columns instead
* Update unit tests to use new partitions and remove deprecated tests

Side effects of this change:

* The new partitioning schema introduces a 3-level folder structure within TIMDEXDataset.location (i.e. the base path of the dataset) for [year, month, day], where the leaf node will contain parquet files for every source run.

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/TIMX-432
1 parent 5769260 commit 76347b1

6 files changed

Lines changed: 80 additions & 153 deletions
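Note: the `DatasetRecord` changes described in the commit message land in `timdex_dataset_api/record.py`, which is not among the diffs shown below. Based on the commit message and the error string asserted in the tests, a minimal sketch of the post-init derivation could look like the following; the `attrs` wiring and parsing details are assumptions, not the actual implementation:

```python
# Hypothetical sketch of DatasetRecord's run_date -> [year, month, day]
# derivation; fields and error text come from this commit's tests, the
# attrs-based implementation is assumed.
import datetime

from attrs import define


class InvalidDatasetRecordError(Exception):
    """Raised when a DatasetRecord fails validation."""


@define
class DatasetRecord:
    timdex_record_id: str
    source_record: bytes
    transformed_record: bytes
    source: str
    run_date: str
    run_type: str
    action: str
    run_id: str
    year: str | int | None = None
    month: str | int | None = None
    day: str | int | None = None

    def __attrs_post_init__(self) -> None:
        # derive partition values from run_date when not explicitly provided
        if not all([self.year, self.month, self.day]):
            try:
                parsed = datetime.date.fromisoformat(str(self.run_date))
            except ValueError as exc:
                raise InvalidDatasetRecordError(
                    "Cannot parse partition values [year, month, date] "
                    "from invalid 'run-date' string."
                ) from exc
            self.year = str(parsed.year)
            self.month = f"{parsed.month:02}"
            self.day = f"{parsed.day:02}"
```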


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -156,3 +156,6 @@ cython_debug/
 
 # PyCharm
 .idea/
+
+# VSCode
+.vscode

tests/conftest.py

Lines changed: 1 addition & 6 deletions
@@ -51,12 +51,7 @@ def sample_records_iter_without_partitions():
 
     def _records_iter(num_records):
         return generate_sample_records(
-            num_records,
-            source=None,
-            run_date=None,
-            run_type=None,
-            action=None,
-            run_id=None,
+            num_records, run_date="invalid run-date", year=None, month=None, day=None
         )
 
     return _records_iter
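Note: as reworked, this fixture yields records whose `run_date` cannot be parsed and which carry no explicit partition values. A hypothetical use (not a test in this commit), assuming records are built lazily by the `generate_sample_records` generator and that `pytest` and `InvalidDatasetRecordError` are imported:

```python
# Hypothetical: constructing the records should now fail in DatasetRecord's
# post-init, since year/month/day cannot be derived from the bad run_date.
def test_invalid_run_date_fixture(sample_records_iter_without_partitions):
    with pytest.raises(InvalidDatasetRecordError):
        list(sample_records_iter_without_partitions(1))
```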

tests/test_dataset_write.py

Lines changed: 32 additions & 111 deletions
@@ -1,12 +1,13 @@
 # ruff: noqa: S105, S106, SLF001, PLR2004, PD901, D209, D205
 
-import datetime
 import math
 import os
+import re
 
 import pyarrow.dataset as ds
 import pytest
 
+from tests.utils import generate_sample_records
 from timdex_dataset_api.dataset import (
     MAX_ROWS_PER_FILE,
     TIMDEX_DATASET_SCHEMA,
@@ -17,7 +18,7 @@
 from timdex_dataset_api.record import DatasetRecord
 
 
-def test_dataset_record_serialization():
+def test_dataset_record_init():
     values = {
         "timdex_record_id": "alma:123",
         "source_record": b"<record><title>Hello World.</title></record>",
@@ -26,38 +27,38 @@ def test_dataset_record_serialization():
         "run_date": "2024-12-01",
         "run_type": "full",
         "action": "index",
-        "run_id": "abc123",
+        "run_id": "000-111-aaa-bbb",
+        "year": 2024,
+        "month": 12,
+        "day": 1,
     }
-    dataset_record = DatasetRecord(**values)
-    assert dataset_record.to_dict() == values
+    assert DatasetRecord(**values)
 
 
-def test_dataset_record_serialization_with_partition_values_provided():
-    dataset_record = DatasetRecord(
-        timdex_record_id="alma:123",
-        source_record=b"<record><title>Hello World.</title></record>",
-        transformed_record=b"""{"title":["Hello World."]}""",
-    )
-    partition_values = {
-        "source": "alma",
-        "run_date": "2024-12-01",
-        "run_type": "daily",
-        "action": "index",
-        "run_id": "000-111-aaa-bbb",
-    }
-    assert dataset_record.to_dict(partition_values=partition_values) == {
+def test_dataset_record_init_with_invalid_run_date_raise_error():
+    values = {
         "timdex_record_id": "alma:123",
         "source_record": b"<record><title>Hello World.</title></record>",
         "transformed_record": b"""{"title":["Hello World."]}""",
-        "source": "alma",
-        "run_date": "2024-12-01",
-        "run_type": "daily",
+        "source": "libguides",
+        "run_date": "-12-01",
+        "run_type": "full",
         "action": "index",
         "run_id": "000-111-aaa-bbb",
+        "year": None,
+        "month": None,
+        "day": None,
     }
+    with pytest.raises(
+        InvalidDatasetRecordError,
+        match=re.escape(
+            "Cannot parse partition values [year, month, date] from invalid 'run-date' string."  # noqa: E501
+        ),
+    ):
+        DatasetRecord(**values)
 
 
-def test_dataset_record_serialization_missing_partition_raise_error():
+def test_dataset_record_serialization():
     values = {
         "timdex_record_id": "alma:123",
         "source_record": b"<record><title>Hello World.</title></record>",
@@ -66,14 +67,13 @@ def test_dataset_record_serialization_missing_partition_raise_error():
         "run_date": "2024-12-01",
         "run_type": "full",
         "action": "index",
-        "run_id": None,  # <------ missing partition here
+        "run_id": "abc123",
+        "year": "2024",
+        "month": "12",
+        "day": "01",
     }
     dataset_record = DatasetRecord(**values)
-    with pytest.raises(
-        InvalidDatasetRecordError,
-        match="Partition values are missing: run_id",
-    ):
-        assert dataset_record.to_dict() == values
+    assert dataset_record.to_dict() == values
 
 
 def test_dataset_write_records_to_new_dataset(new_dataset, sample_records_iter):
@@ -134,52 +134,6 @@ def test_dataset_write_to_multiple_locations_raise_error(sample_records_iter):
     timdex_dataset.write(sample_records_iter(10))
 
 
-def test_dataset_write_mixin_partition_values_used(
-    new_dataset, sample_records_iter_without_partitions
-):
-    partition_values = {
-        "source": "alma",
-        "run_date": "2024-12-01",
-        "run_type": "daily",
-        "action": "index",
-        "run_id": "000-111-aaa-bbb",
-    }
-    _written_files = new_dataset.write(
-        sample_records_iter_without_partitions(10),
-        partition_values=partition_values,
-    )
-    new_dataset.reload()
-
-    # load as pandas dataframe and assert column values
-    df = new_dataset.dataset.to_table().to_pandas()
-    row = df.iloc[0]
-    assert row.source == partition_values["source"]
-    assert row.run_date == datetime.date(2024, 12, 1)
-    assert row.run_type == partition_values["run_type"]
-    assert row.action == partition_values["action"]
-    assert row.action == partition_values["action"]
-
-
-def test_dataset_write_schema_partitions_correctly_ordered(
-    new_dataset, sample_records_iter
-):
-    written_files = new_dataset.write(
-        sample_records_iter(10),
-        partition_values={
-            "source": "alma",
-            "run_date": "2024-12-01",
-            "run_type": "daily",
-            "run_id": "000-111-aaa-bbb",
-            "action": "index",
-        },
-    )
-    file = written_files[0]
-    assert (
-        "/source=alma/run_date=2024-12-01/run_type=daily"
-        "/run_id=000-111-aaa-bbb/action=index/" in file.path
-    )
-
-
 def test_dataset_write_schema_applied_to_dataset(new_dataset, sample_records_iter):
     new_dataset.write(sample_records_iter(10))
 
@@ -199,38 +153,20 @@ def test_dataset_write_partition_deleted_when_written_to_again(
 ):
     """This tests the existing_data_behavior="delete_matching" configuration when writing
     to a dataset."""
-    partition_values = {
-        "source": "alma",
-        "run_date": "2024-12-01",
-        "run_type": "daily",
-        "action": "index",
-        "run_id": "000-111-aaa-bbb",
-    }
-
     # perform FIRST write to run_date="2024-12-01"
-    written_files_1 = new_dataset.write(
-        sample_records_iter(10),
-        partition_values=partition_values,
-    )
+    written_files_1 = new_dataset.write(sample_records_iter(10))
 
     # assert that files from first write are present at this time
     assert os.path.exists(written_files_1[0].path)
 
     # perform unrelated write with new run_date to confirm this is untouched during delete
-    new_partition_values = partition_values.copy()
-    new_partition_values["run_date"] = "2024-12-15"
-    new_partition_values["run_id"] = "222-333-ccc-ddd"
     written_files_x = new_dataset.write(
-        sample_records_iter(7),
-        partition_values=new_partition_values,
+        generate_sample_records(7, run_date="2024-12-15"),
     )
 
     # perform SECOND write to run_date="2024-12-01", expecting this to delete everything
     # under this combination of partitions (i.e. the first write)
-    written_files_2 = new_dataset.write(
-        sample_records_iter(10),
-        partition_values=partition_values,
-    )
+    written_files_2 = new_dataset.write(sample_records_iter(10))
 
     new_dataset.reload()
 
@@ -243,18 +179,3 @@ def test_dataset_write_partition_deleted_when_written_to_again(
     assert not os.path.exists(written_files_1[0].path)
     assert os.path.exists(written_files_2[0].path)
     assert os.path.exists(written_files_x[0].path)
-
-
-def test_dataset_write_missing_partitions_raise_error(new_dataset, sample_records_iter):
-    missing_partition_values = {
-        "source": "libguides",
-        "run_date": None,
-        "run_type": None,
-        "action": None,
-        "run_id": None,
-    }
-    with pytest.raises(InvalidDatasetRecordError, match="Partition values are missing"):
-        _ = new_dataset.write(
-            sample_records_iter(10),
-            partition_values=missing_partition_values,
-        )
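Note: the deleted path-ordering test has no direct replacement in this commit. A hypothetical equivalent against the new three-level layout, assuming hive-style paths are retained and using the defaults from `tests/utils.py` (`year="2024"`, `month="12"`, `day="1"`), might look like:

```python
# Hypothetical test (not in this commit): written files should land under the
# new year/month/day hive partition folders.
def test_dataset_write_year_month_day_partitions_in_path(
    new_dataset, sample_records_iter
):
    written_files = new_dataset.write(sample_records_iter(10))
    assert "/year=2024/month=12/day=1/" in written_files[0].path
```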

tests/utils.py

Lines changed: 6 additions & 0 deletions
@@ -17,6 +17,9 @@ def generate_sample_records(
     run_type: str | None = "daily",
     action: str | None = "index",
     run_id: str | None = None,
+    year: str | int | None = "2024",
+    month: str | int | None = "12",
+    day: str | int | None = "1",
 ) -> Iterator[DatasetRecord]:
     """Generate sample DatasetRecords."""
     if not run_id:
@@ -32,6 +35,9 @@ def generate_sample_records(
             run_type=run_type,
             action=action,
             run_id=run_id,
+            year=year,
+            month=month,
+            day=day,
         )
 
 
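Note: a quick sketch of how the updated helper might be exercised, with the new keyword arguments overriding their defaults (`generate_sample_records` is a generator, so records materialize on iteration):

```python
# Assumed usage of the updated test helper.
records = list(generate_sample_records(2, run_date="2024-12-15", day="15"))
assert all(r.year == "2024" and r.month == "12" and r.day == "15" for r in records)
```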

timdex_dataset_api/dataset.py

Lines changed: 9 additions & 23 deletions
@@ -1,6 +1,5 @@
 """timdex_dataset_api/dataset.py"""
 
-import datetime
 import itertools
 import time
 import uuid
@@ -30,15 +29,16 @@
         pa.field("run_type", pa.string()),
         pa.field("run_id", pa.string()),
         pa.field("action", pa.string()),
+        pa.field("year", pa.string()),
+        pa.field("month", pa.string()),
+        pa.field("day", pa.string()),
     )
 )
 
 TIMDEX_DATASET_PARTITION_COLUMNS = [
-    "source",
-    "run_date",
-    "run_type",
-    "run_id",
-    "action",
+    "year",
+    "month",
+    "day",
 ]
 
 DEFAULT_BATCH_SIZE = 1_000
@@ -166,16 +166,12 @@ def write(
         self,
         records_iter: Iterator["DatasetRecord"],
         *,
-        partition_values: dict[str, str | datetime.datetime] | None = None,
         batch_size: int = DEFAULT_BATCH_SIZE,
         use_threads: bool = True,
     ) -> list[ds.WrittenFile]:
         """Write records to the TIMDEX parquet dataset.
 
-        This method expects an iterator of DatasetRecord instances, with optional
-        partition column values that will be applied to all rows written (often, these
-        are the same for all rows written, eliminating the need to repeat those values
-        in the iterator).
+        This method expects an iterator of DatasetRecord instances.
 
         This method encapsulates all dataset writing mechanics and performance
         optimizations (e.g. batching) so that the calling context can focus on yielding
@@ -192,7 +188,6 @@ def write(
 
         Args:
             - records_iter: Iterator of DatasetRecord instances
-            - partition_values: dictionary of static partition column name/value pairs
             - batch_size: size for batches to yield and write, directly affecting row
             group size in final parquet files
             - use_threads: boolean if threads should be used for writing
@@ -207,7 +202,6 @@ def write(
 
         record_batches_iter = self.get_dataset_record_batches(
             records_iter,
-            partition_values=partition_values,
            batch_size=batch_size,
         )
 
@@ -235,32 +229,24 @@ def get_dataset_record_batches(
         self,
         records_iter: Iterator["DatasetRecord"],
         *,
-        partition_values: dict[str, str | datetime.datetime] | None = None,
         batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> Iterator[pa.RecordBatch]:
         """Yield pyarrow.RecordBatches for writing.
 
-        This method expects an iterator of DatasetRecord instances, with optional
-        partition column values that will be applied to all rows written (often, these
-        are the same for all rows written, eliminating the need to repeat those values
-        in the iterator).
+        This method expects an iterator of DatasetRecord instances.
 
         Each DatasetRecord is validated and serialized to a dictionary before added to a
         pyarrow.RecordBatch for writing.
 
         Args:
             - records_iter: Iterator of DatasetRecord instances
-            - partition_values: dictionary of static partition column name/value pairs
             - batch_size: size for batches to yield and write, directly affecting row
             group size in final parquet files
         """
         for i, record_batch in enumerate(itertools.batched(records_iter, batch_size)):
             batch_start_time = time.perf_counter()
             batch = pa.RecordBatch.from_pylist(
-                [
-                    record.to_dict(partition_values=partition_values)
-                    for record in record_batch
-                ]
+                [record.to_dict() for record in record_batch]
             )
             logger.debug(
                 f"Batch {i + 1} yielded for writing, "
