Skip to content

Commit b346fb3

Browse files
committed
Validate DatasetRecord during serialization
Why these changes are being introduced: When a DatasetRecord is serialized for writing, it contains the dataset partitions that the record will fall under. We do not want any records getting written to the dataset without all required partitions. Additionally, we may want to add future validations to the record, e.g. ensuring the transformed record is valid JSON. This provides a place for that. How this addresses that need: * Adds a new DatasetRecord.validate() method, and applies this method during to_dict() serialization. Side effects of this change: * Records cannot be written to the dataset without all required partitions. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-415
1 parent 7759a2f commit b346fb3

4 files changed

Lines changed: 77 additions & 13 deletions

File tree

tests/test_dataset_write.py

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,23 @@
1313
DatasetNotLoadedError,
1414
TIMDEXDataset,
1515
)
16+
from timdex_dataset_api.exceptions import InvalidDatasetRecordError
1617
from timdex_dataset_api.record import DatasetRecord
1718

1819

1920
def test_dataset_record_serialization():
20-
dataset_record = DatasetRecord(
21-
timdex_record_id="alma:123",
22-
source_record=b"<record><title>Hello World.</title></record>",
23-
transformed_record=b"""{"title":["Hello World."]}""",
24-
)
25-
assert dataset_record.to_dict() == {
21+
values = {
2622
"timdex_record_id": "alma:123",
2723
"source_record": b"<record><title>Hello World.</title></record>",
2824
"transformed_record": b"""{"title":["Hello World."]}""",
29-
"source": None,
30-
"run_date": None,
31-
"run_type": None,
32-
"action": None,
33-
"run_id": None,
25+
"source": "libguides",
26+
"run_date": "2024-12-01",
27+
"run_type": "full",
28+
"action": "index",
29+
"run_id": "abc123",
3430
}
31+
dataset_record = DatasetRecord(**values)
32+
assert dataset_record.to_dict() == values
3533

3634

3735
def test_dataset_record_serialization_with_partition_values_provided():
@@ -59,6 +57,25 @@ def test_dataset_record_serialization_with_partition_values_provided():
5957
}
6058

6159

60+
def test_dataset_record_serialization_missing_partition_raise_error():
61+
values = {
62+
"timdex_record_id": "alma:123",
63+
"source_record": b"<record><title>Hello World.</title></record>",
64+
"transformed_record": b"""{"title":["Hello World."]}""",
65+
"source": "libguides",
66+
"run_date": "2024-12-01",
67+
"run_type": "full",
68+
"action": "index",
69+
"run_id": None, # <------ missing partition here
70+
}
71+
dataset_record = DatasetRecord(**values)
72+
with pytest.raises(
73+
InvalidDatasetRecordError,
74+
match="Partition values are missing: run_id",
75+
):
76+
assert dataset_record.to_dict() == values
77+
78+
6279
def test_dataset_write_records_to_new_dataset(new_dataset, sample_records_iter):
6380
files_written = new_dataset.write(sample_records_iter(10_000))
6481
assert len(files_written) == 1
@@ -226,3 +243,18 @@ def test_dataset_write_partition_deleted_when_written_to_again(
226243
assert not os.path.exists(written_files_1[0].path)
227244
assert os.path.exists(written_files_2[0].path)
228245
assert os.path.exists(written_files_x[0].path)
246+
247+
248+
def test_dataset_write_missing_partitions_raise_error(new_dataset, sample_records_iter):
249+
missing_partition_values = {
250+
"source": "libguides",
251+
"run_date": None,
252+
"run_type": None,
253+
"action": None,
254+
"run_id": None,
255+
}
256+
with pytest.raises(InvalidDatasetRecordError, match="Partition values are missing"):
257+
_ = new_dataset.write(
258+
sample_records_iter(10),
259+
partition_values=missing_partition_values,
260+
)

timdex_dataset_api/dataset.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,15 @@ def write(
180180
optimizations (e.g. batching) so that the calling context can focus on yielding
181181
data.
182182
183+
For write, the configuration existing_data_behavior="delete_matching" is used.
184+
This means that during write, if any pre-existing files are found for the exact
185+
combinations of partitions for that batch, those pre-existing files will be
186+
deleted. This effectively makes a write idempotent to the TIMDEX dataset.
187+
188+
A max_open_files=500 configuration is set to avoid AWS S3 503 error "SLOW_DOWN"
189+
if too many PutObject calls are made in parallel. Testing suggests this does not
190+
substantially slow down the overall write.
191+
183192
Args:
184193
- records_iter: Iterator of DatasetRecord instances
185194
- partition_values: dictionary of static partition column name/value pairs
@@ -209,7 +218,7 @@ def write(
209218
filesystem=self.filesystem,
210219
file_visitor=lambda written_file: written_files.append(written_file),
211220
format="parquet",
212-
max_open_files=500, # avoids S3 503 "SLOW_DOWN" error for PutObject requests
221+
max_open_files=500,
213222
max_rows_per_file=MAX_ROWS_PER_FILE,
214223
max_rows_per_group=MAX_ROWS_PER_GROUP,
215224
partitioning=self.partition_columns,
@@ -235,7 +244,7 @@ def get_dataset_record_batches(
235244
are the same for all rows written, eliminating the need to repeat those values
236245
in the iterator).
237246
238-
Each DatasetRecord is serialized to a dictionary before added to a
247+
Each DatasetRecord is validated and serialized to a dictionary before added to a
239248
pyarrow.RecordBatch for writing.
240249
241250
Args:

timdex_dataset_api/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,7 @@
33

44
class DatasetNotLoadedError(Exception):
55
"""Custom exception for accessing methods requiring a loaded dataset."""
6+
7+
8+
class InvalidDatasetRecordError(Exception):
9+
"""Custom exception for invalid DatasetRecord instances."""

timdex_dataset_api/record.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import datetime
44
from dataclasses import asdict, dataclass
55

6+
from timdex_dataset_api.exceptions import InvalidDatasetRecordError
7+
68

79
@dataclass
810
class DatasetRecord:
@@ -27,10 +29,27 @@ class DatasetRecord:
2729

2830
def to_dict(
2931
self,
32+
*,
3033
partition_values: dict[str, str | datetime.datetime] | None = None,
34+
validate: bool = True,
3135
) -> dict:
3236
"""Serialize instance as dictionary, setting partition values if passed."""
3337
if partition_values:
3438
for key, value in partition_values.items():
3539
setattr(self, key, value)
40+
if validate:
41+
self.validate()
3642
return asdict(self)
43+
44+
def validate(self) -> None:
45+
"""Validate DatasetRecord for writing."""
46+
# ensure all partition columns are set
47+
missing_partition_values = [
48+
field
49+
for field in ["source", "run_date", "run_type", "action", "run_id"]
50+
if getattr(self, field) is None
51+
]
52+
if missing_partition_values:
53+
raise InvalidDatasetRecordError(
54+
f"Partition values are missing: {', '.join(missing_partition_values)}"
55+
)

0 commit comments

Comments
 (0)