Add run_record_offset column to dataset

ghukill · ghukill · commit 3a3fbbc0956c · 2025-01-15T13:17:39.000-05:00
Why these changes are being introduced: Bulk reading from and writing to the TIMDEX dataset is a primary responsibility, but occasional random access (e.g. locating a single record row) will be helpful, for instance when looking at the original source record for a problematic record. Each TIMDEX JSON record in OpenSearch will contain a "provenance" object that will include things like run_date, run_id, and now run_record_offset. This offset allows for quicker (less time) and more efficient (less data read) retrieval of a single record given the information in the TIMDEX provenance object. How this addresses that need: Parquet files have embedded metadata that describes what values can be found in subsets of the file, but this is only helpful when the min/max values in that metadata can tell query engines whether a desired record may be present. Unfortunately, timdex_record_id values are a) not lexicographically sortable (at least not easily), and b) not ordered during write. By adding this offset, effectively an incrementing counter as records are yielded for writing, we have a value that is pre-sorted and provides nice ranges in the Parquet file metadata. Query engines can utilize this to dramatically improve random access reads. By including this offset integer in the TIMDEX record "provenance" section, we close the loop and provide enough information in the OpenSearch record to efficiently retrieve it from the Parquet dataset. Side effects of this change: * Dataset will now include a new column 'run_record_offset' Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-465
diff --git a/tests/test_records.py b/tests/test_records.py
@@ -16,6 +16,7 @@ def test_dataset_record_init_with_valid_run_date_parses_year_month_day():
         "run_type": "full",
         "action": "index",
         "run_id": "000-111-aaa-bbb",
+        "run_record_offset": 0,
     }
     record = DatasetRecord(**values)
 
@@ -37,6 +38,7 @@ def test_dataset_record_init_with_invalid_run_date_raise_error():
         "run_type": "full",
         "action": "index",
         "run_id": "000-111-aaa-bbb",
+        "run_record_offset": 0,
     }
 
     with pytest.raises(
@@ -55,6 +57,7 @@ def test_dataset_record_serialization():
         "run_type": "full",
         "action": "index",
         "run_id": "abc123",
+        "run_record_offset": 0,
     }
     dataset_record = DatasetRecord(**values)
 
@@ -67,6 +70,7 @@ def test_dataset_record_serialization():
         "run_type": "full",
         "action": "index",
         "run_id": "abc123",
+        "run_record_offset": 0,
         "year": "2024",
         "month": "12",
         "day": "01",
diff --git a/tests/utils.py b/tests/utils.py
@@ -32,6 +32,7 @@ def generate_sample_records(
             run_type=run_type,
             action=action,
             run_id=run_id,
+            run_record_offset=x,
         )
 
 
diff --git a/timdex_dataset_api/__init__.py b/timdex_dataset_api/__init__.py
@@ -3,7 +3,7 @@
 from timdex_dataset_api.dataset import TIMDEXDataset
 from timdex_dataset_api.record import DatasetRecord
 
-__version__ = "0.7.0"
+__version__ = "0.8.0"
 
 __all__ = [
     "DatasetRecord",
diff --git a/timdex_dataset_api/dataset.py b/timdex_dataset_api/dataset.py
@@ -33,8 +33,9 @@
         pa.field("source", pa.string()),
         pa.field("run_date", pa.date32()),
         pa.field("run_type", pa.string()),
-        pa.field("run_id", pa.string()),
         pa.field("action", pa.string()),
+        pa.field("run_id", pa.string()),
+        pa.field("run_record_offset", pa.int32()),
         pa.field("year", pa.string()),
         pa.field("month", pa.string()),
         pa.field("day", pa.string()),
@@ -53,8 +54,9 @@ class DatasetFilters(TypedDict, total=False):
     source: str | None
     run_date: str | date | None
     run_type: str | None
-    run_id: str | None
     action: str | None
+    run_id: str | None
+    run_record_offset: int | None
     year: str | None
     month: str | None
     day: str | None
diff --git a/timdex_dataset_api/record.py b/timdex_dataset_api/record.py
@@ -18,15 +18,15 @@ class DatasetRecord:
     writing.
     """
 
-    # primary columns
     timdex_record_id: str = field()
     source_record: bytes = field()
     transformed_record: bytes = field()
     source: str = field()
     run_date: date = field(converter=strict_date_parse)
     run_type: str = field()
-    run_id: str = field()
     action: str = field()
+    run_id: str = field()
+    run_record_offset: int = field(default=None)
 
     @property
     def year(self) -> str:

Original file line number	Diff line number	Diff line change
`@@ -32,6 +32,7 @@ def generate_sample_records(`
`32`	`32`	`run_type=run_type,`
`33`	`33`	`action=action,`
`34`	`34`	`run_id=run_id,`
	`35`	`+ run_record_offset=x,`
`35`	`36`	`)`
`36`	`37`
`37`	`38`