Commit ed2ce7b
Support filtering for current_records
Why these changes are being introduced:

Unexpected behavior was possible when using load(current_records=True) and then applying additional filtering to the dataset before reading. In short, a non-current record could be yielded if filtering removed the truly current version of the record. This happened because the reverse chronological marking of "seen" records would not "see" the current version and would happily yield an older one.

How this addresses that need:

When load(current_records=True) is used, a clone of the dataset is saved to the TIMDEXDataset object before any additional filtering is applied. This dataset is just metadata, so it is not expensive to store. Then, during any read method, this dataset is used to provide an exhaustive and ordered list of timdex_record_ids. Even if a record has been filtered out by the read method (e.g. limiting records to only action="index"), this secondary list of timdex_record_ids is used as the authoritative list of "seen" timdex_record_ids. There is a bit of network overhead to this parallel batch reading, but it is fairly minimal since we only retrieve the 'timdex_record_id' column; perhaps 1-2 MB of IO per million records.

Side effects of this change:

* Applications like TIM that will likely use this new functionality to yield only "current" records can do so confidently, optionally with additional filtering, knowing they will only encounter current versions of a record from the dataset.

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/TIMX-497
1 parent 9f0d74b commit ed2ce7b
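For context, a minimal usage sketch of the behavior this commit guarantees. The load() and read_dataframe() calls mirror the tests below; the import path, constructor signature, and dataset location are assumptions for illustration only:

    from timdex_dataset_api import TIMDEXDataset  # import path assumed

    # hypothetical dataset location (local path or S3 URI)
    td = TIMDEXDataset("s3://timdex/dataset")

    # a pre-filtering snapshot of the dataset is saved internally during load()
    td.load(current_records=True, source="alma")

    # additional read-time filtering is now safe: only the current version of
    # each record is ever yielded
    df = td.read_dataframe(action="index")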

2 files changed: 148 additions & 52 deletions

tests/test_dataset.py
Lines changed: 60 additions & 1 deletion
@@ -1,4 +1,5 @@
-# ruff: noqa: S105, S106, SLF001, PLR2004
+# ruff: noqa: D205, S105, S106, SLF001, PD901, PLR2004
+
 import os
 from datetime import date
 from unittest.mock import MagicMock, patch
@@ -397,3 +398,61 @@ def test_dataset_all_read_methods_get_deduplication(
     transformed_records = list(local_dataset_with_runs.read_transformed_records_iter())
 
     assert len(full_df) == len(all_records) == len(transformed_records)
+
+
+def test_dataset_current_records_no_additional_filtering_accurate_records_yielded(
+    local_dataset_with_runs,
+):
+    local_dataset_with_runs.load(current_records=True, source="alma")
+    df = local_dataset_with_runs.read_dataframe()
+    assert df.action.value_counts().to_dict() == {"index": 99, "delete": 1}
+
+
+def test_dataset_current_records_action_filtering_accurate_records_yielded(
+    local_dataset_with_runs,
+):
+    local_dataset_with_runs.load(current_records=True, source="alma")
+    df = local_dataset_with_runs.read_dataframe(action="index")
+    assert df.action.value_counts().to_dict() == {"index": 99}
+
+
+def test_dataset_current_records_index_filtering_accurate_records_yielded(
+    local_dataset_with_runs,
+):
+    """This is a somewhat complex test, but it demonstrates that only 'current' records
+    are yielded when .load(current_records=True) is applied.
+
+    Given these runs from the fixture:
+        [
+            ...
+            (25, "alma", "2025-01-03", "daily", "index", "run-5"), <---- filtered to
+            (10, "alma", "2025-01-04", "daily", "delete", "run-6"), <---- influences current
+            ...
+        ]
+
+    Though we are filtering to run-5, which has 25 total records to index, we see only
+    15 records yielded. Why? While we have filtered to only yield from run-5, run-6 had
+    10 deletes which made records alma:0-alma:9 no longer "current" in run-5. As we
+    yield records reverse chronologically, the deletes from run-6 (alma:0-alma:9)
+    "influence" what records we see as we continue backwards in time.
+    """
+    local_dataset_with_runs.load(current_records=True, source="alma")
+    df = local_dataset_with_runs.read_dataframe(run_id="run-5")
+    assert df.action.value_counts().to_dict() == {"index": 15}
+    assert list(df.timdex_record_id) == [
+        "alma:10",
+        "alma:11",
+        "alma:12",
+        "alma:13",
+        "alma:14",
+        "alma:15",
+        "alma:16",
+        "alma:17",
+        "alma:18",
+        "alma:19",
+        "alma:20",
+        "alma:21",
+        "alma:22",
+        "alma:23",
+        "alma:24",
+    ]

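The arithmetic in the test docstring above can be checked with a small standalone simulation of the "seen" logic. This is an illustrative sketch, not library code, with the fixture runs paraphrased from the docstring:

    # newest run first, mirroring the reverse chronological read order
    runs = [
        ("run-6", "delete", [f"alma:{i}" for i in range(10)]),
        ("run-5", "index", [f"alma:{i}" for i in range(25)]),
    ]

    seen: set[str] = set()
    yielded: list[str] = []
    for run_id, _action, record_ids in runs:
        for record_id in record_ids:
            # the read filter run_id="run-5" drops run-6 rows from the yield...
            if record_id not in seen and run_id == "run-5":
                yielded.append(record_id)
            # ...but every id is still marked as seen
            seen.add(record_id)

    assert yielded == [f"alma:{i}" for i in range(10, 25)]  # 15 records

Run-6's ten deletes mark alma:0 through alma:9 as seen before run-5 is reached, leaving exactly the 15 records the test asserts.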
timdex_dataset_api/dataset.py
Lines changed: 88 additions & 51 deletions
@@ -120,7 +120,9 @@ def __init__(
         self.schema = TIMDEX_DATASET_SCHEMA
         self.partition_columns = TIMDEX_DATASET_PARTITION_COLUMNS
         self._written_files: list[ds.WrittenFile] = None  # type: ignore[assignment]
-        self._dedupe_on_read: bool = False
+
+        self._current_records: bool = False
+        self._current_records_dataset: ds.Dataset = None  # type: ignore[assignment]
 
     @property
     def row_count(self) -> int:
@@ -153,27 +155,32 @@ def load(
             - filters: kwargs typed via DatasetFilters TypedDict
                 - Filters passed directly in method call, e.g. source="alma",
                 run_date="2024-12-20", etc., but are typed according to DatasetFilters.
+            - current_records: bool
+                - if True, the TIMDEXRunManager will be used to retrieve a list of
+                parquet files associated with current runs, and internal flags will be
+                set, ensuring that only current records are yielded by any read method
         """
         start_time = time.perf_counter()
 
         # reset paths from original location before load
         _, self.paths = self.parse_location(self.location)
 
         # perform initial load of full dataset
-        self._load_pyarrow_dataset()
+        self.dataset = self._load_pyarrow_dataset()
 
-        # if current_records flag set, limit to parquet files associated with current runs
-        self._dedupe_on_read = current_records
+        self._current_records = current_records
         if current_records:
-            timdex_run_manager = TIMDEXRunManager(timdex_dataset=self)
 
             timdex_run_manager = TIMDEXRunManager(dataset=self.dataset)
             self.paths = timdex_run_manager.get_current_parquet_files(
                 source=filters.get("source")
             )
 
-            # reload pyarrow dataset
-            self._load_pyarrow_dataset()
+            # reload pyarrow dataset, now filtered to an explicit list of parquet files;
+            # also save an instance of the dataset before any additional filtering
+            dataset = self._load_pyarrow_dataset()
+            self.dataset = dataset
+            self._current_records_dataset = dataset
 
         # filter dataset
         self.dataset = self._get_filtered_dataset(**filters)
@@ -183,9 +190,9 @@ def load(
             f"{round(time.perf_counter()-start_time, 2)}s"
         )
 
-    def _load_pyarrow_dataset(self) -> None:
+    def _load_pyarrow_dataset(self) -> ds.Dataset:
         """Load the pyarrow dataset per local filesystem and paths attributes."""
-        self.dataset = ds.dataset(
+        return ds.dataset(
             self.paths,
             schema=self.schema,
             format="parquet",
@@ -449,19 +456,14 @@ def read_batches_iter(
         """Yield pyarrow.RecordBatches from the dataset.
 
         While batch_size will limit the max rows per batch, filtering may result in some
-        batches have less than this limit.
+        batches having less than this limit.
+
+        If the flag self._current_records is set, this method leans on
+        self._yield_current_record_deduped_batches() to apply deduplication of records,
+        ensuring that only current versions of a record are ever yielded.
 
         Args:
             - columns: list[str], list of columns to return from the dataset
-            - batch_size: int, max number of rows to yield per batch
-            - batch_read_ahead: int, the number of batches to read ahead in a file. This
-                might not work for all file formats. Increasing this number will increase
-                RAM usage but could also improve IO utilization. Pyarrow default is 16,
-                but this library defaults to 0 to prioritize memory footprint.
-            - fragment_read_ahead: int, the number of files to read ahead. Increasing this
-                number will increase RAM usage but could also improve IO utilization.
-                Pyarrow default is 4, but this library defaults to 0 to prioritize memory
-                footprint.
             - filters: pairs of column:value to filter the dataset
         """
         if not self.dataset:
@@ -477,47 +479,82 @@ def read_batches_iter(
             fragment_readahead=self.config.fragment_read_ahead,
         )
 
-        if self._dedupe_on_read:
-            yield from self._yield_deduped_batches(batches)
+        if self._current_records:
+            yield from self._yield_current_record_deduped_batches(batches)
         else:
             for batch in batches:
                 if len(batch) > 0:
                     yield batch
 
-    def _yield_deduped_batches(
-        self, batches: Iterator[pa.RecordBatch]
+    def _yield_current_record_deduped_batches(
+        self,
+        batches: Iterator[pa.RecordBatch],
     ) -> Iterator[pa.RecordBatch]:
-        """Method to yield record deduped batches.
+        """Method to yield only the most recent version of each record.
+
+        When multiple versions of a record (same timdex_record_id) exist in the dataset,
+        this method ensures only the most recent version is returned. If filtering is
+        applied that removes this most recent version of a record, that timdex_record_id
+        will not be yielded at all.
+
+        This is achieved by iterating over TWO record batch iterators in parallel:
+
+        1. "batches" - the RecordBatch iterator passed to this method, which contains
+        the actual records and columns we are interested in, and may have filtering
+        applied
+
+        2. "id_batches" - a lightweight RecordBatch iterator that only contains the
+        'timdex_record_id' column from a pre-filtering dataset saved during .load()
+
+        These two iterators are guaranteed to have the same number of total batches,
+        based on how pyarrow.Dataset.to_batches() reads from parquet files. Even if
+        dataset filtering is applied, this does not affect the batch count; you may just
+        end up with smaller or empty batches.
 
-        Extending the normal behavior of yielding batches untouched, this method keeps
-        track of seen timdex_record_id's, yielding them only once. For this method to
-        yield the most current version of a record -- most common usage -- it is required
-        that the batches are pre-ordered so the most recent record version is encountered
-        first.
+        As such, as we move through the batches we use batches from "id_batches" to
+        keep a set of seen timdex_record_id's. Even if a timdex_record_id is not in
+        "batches", likely due to filtering, we will mark the truly most current version
+        as "seen" and not yield it from any future batches.
+
+        Args:
+            - batches: batches of records to actually yield from; a parallel, internally
+            created iterator of timdex_record_id batches informs when to yield or skip
+            a record from a batch
         """
+        # create a RecordBatch iterator from self._current_records_dataset, which was
+        # saved during .load() before any filtering was applied
+        id_batches = self._current_records_dataset.to_batches(
+            columns=["timdex_record_id"],
+            batch_size=self.config.read_batch_size,
+            batch_readahead=self.config.batch_read_ahead,
+            fragment_readahead=self.config.fragment_read_ahead,
+        )
+
         seen_records = set()
-        for batch in batches:
-            if len(batch) > 0:
-                # init list of batch indices for records unseen
-                unseen_batch_indices = []
-
-                # get list of timdex ids from batch
-                timdex_ids = batch.column("timdex_record_id").to_pylist()
-
-                # check each record id and track unseen ones
-                for i, record_id in enumerate(timdex_ids):
-                    if record_id not in seen_records:
-                        unseen_batch_indices.append(i)
-                        seen_records.add(record_id)
-
-                # if all records from batch were seen, continue
-                if not unseen_batch_indices:
-                    continue
-
-                # else, yield unseen records from batch
-                deduped_batch = batch.take(pa.array(unseen_batch_indices))  # type: ignore[arg-type]
-                if len(deduped_batch) > 0:
-                    yield deduped_batch
+        for id_batch, batch in zip(id_batches, batches, strict=True):
+            dedupe_ids = id_batch.column("timdex_record_id").to_pylist()
+            batch_ids = batch.column("timdex_record_id").to_pylist()
+
+            # init list of indices from the batch for records we have never yielded
+            unseen_batch_indices = []
+
+            # check each record id and track unseen ones
+            for i, record_id in enumerate(batch_ids):
+                if record_id not in seen_records:
+                    unseen_batch_indices.append(i)
+
+            # even if a record will not be yielded, update our set of seen records from
+            # all records in the id_batch
+            seen_records.update(dedupe_ids)
+
+            # if no records are unseen this batch, skip yielding
+            if not unseen_batch_indices:
+                continue
+
+            # use the unseen indices to create a subset of the batch and yield it
+            _batch = batch.take(pa.array(unseen_batch_indices))  # type: ignore[arg-type]
+            if len(_batch) > 0:
+                yield _batch
 
     def read_dataframes_iter(
         self,