Skip to content

Commit 3a3fbbc

Browse files
committed
Add run_record_offset column to dataset
Why these changes are being introduced: Bulk reading and writing from the TIMDEX dataset is a primary responsibility, but occasional random access (e.g. locating a single record row) will be helpful, for instance when looking at the original source record for a problematic record. Each TIMDEX JSON record in OpenSearch will contain a "provenance" object that will include things like run_date, run_id, and now run_record_offset. This offset allows for quicker (time) and more efficient (data read) retrieval of a single record given information in the TIMDEX provenance object. How this addresses that need: Parquet files have metadata embedded that describe what values can be found in subsets of the file, but this is only helpful when the min/max values in that metadata can inform query engines if a desired record may be present. Unfortunately, timdex_record_id values are a) not lexicographically sortable (at least not easily), and b) not ordered during write. By adding this offset, effectively an incrementing counter as records are yielded for writing, we have a value that is pre-sorted and provides nice ranges in the parquet file metadata. Query engines can utilize this to dramatically improve random access reads. By including this offset integer in the TIMDEX record "provenance" section we close the loop and provide enough information in the OpenSearch record to efficiently retrieve it from the parquet dataset. Side effects of this change: * Dataset will now include a new column 'run_record_offset' Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-465
1 parent 07f75ee commit 3a3fbbc

5 files changed

Lines changed: 12 additions & 5 deletions

File tree

tests/test_records.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ def test_dataset_record_init_with_valid_run_date_parses_year_month_day():
1616
"run_type": "full",
1717
"action": "index",
1818
"run_id": "000-111-aaa-bbb",
19+
"run_record_offset": 0,
1920
}
2021
record = DatasetRecord(**values)
2122

@@ -37,6 +38,7 @@ def test_dataset_record_init_with_invalid_run_date_raise_error():
3738
"run_type": "full",
3839
"action": "index",
3940
"run_id": "000-111-aaa-bbb",
41+
"run_record_offset": 0,
4042
}
4143

4244
with pytest.raises(
@@ -55,6 +57,7 @@ def test_dataset_record_serialization():
5557
"run_type": "full",
5658
"action": "index",
5759
"run_id": "abc123",
60+
"run_record_offset": 0,
5861
}
5962
dataset_record = DatasetRecord(**values)
6063

@@ -67,6 +70,7 @@ def test_dataset_record_serialization():
6770
"run_type": "full",
6871
"action": "index",
6972
"run_id": "abc123",
73+
"run_record_offset": 0,
7074
"year": "2024",
7175
"month": "12",
7276
"day": "01",

tests/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def generate_sample_records(
3232
run_type=run_type,
3333
action=action,
3434
run_id=run_id,
35+
run_record_offset=x,
3536
)
3637

3738

timdex_dataset_api/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from timdex_dataset_api.dataset import TIMDEXDataset
44
from timdex_dataset_api.record import DatasetRecord
55

6-
__version__ = "0.7.0"
6+
__version__ = "0.8.0"
77

88
__all__ = [
99
"DatasetRecord",

timdex_dataset_api/dataset.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,9 @@
3333
pa.field("source", pa.string()),
3434
pa.field("run_date", pa.date32()),
3535
pa.field("run_type", pa.string()),
36-
pa.field("run_id", pa.string()),
3736
pa.field("action", pa.string()),
37+
pa.field("run_id", pa.string()),
38+
pa.field("run_record_offset", pa.int32()),
3839
pa.field("year", pa.string()),
3940
pa.field("month", pa.string()),
4041
pa.field("day", pa.string()),
@@ -53,8 +54,9 @@ class DatasetFilters(TypedDict, total=False):
5354
source: str | None
5455
run_date: str | date | None
5556
run_type: str | None
56-
run_id: str | None
5757
action: str | None
58+
run_id: str | None
59+
run_record_offset: int | None
5860
year: str | None
5961
month: str | None
6062
day: str | None

timdex_dataset_api/record.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@ class DatasetRecord:
1818
writing.
1919
"""
2020

21-
# primary columns
2221
timdex_record_id: str = field()
2322
source_record: bytes = field()
2423
transformed_record: bytes = field()
2524
source: str = field()
2625
run_date: date = field(converter=strict_date_parse)
2726
run_type: str = field()
28-
run_id: str = field()
2927
action: str = field()
28+
run_id: str = field()
29+
run_record_offset: int = field(default=None)
3030

3131
@property
3232
def year(self) -> str:

0 commit comments

Comments
 (0)