Skip to content

Commit 7e0e795

Browse files
committed
TIMDEXDataset capable of yielding current records only
Why these changes are being introduced: With TIMDEXDataset capable of limiting to only parquet files associated with current runs, the next logical step is providing the ability to yield only the current version of a record. This would support a "full refresh" of a TIMDEX source where an application like TIM could yield only current records for a given source and index those to Opensearch. How this addresses that need: When TIMDEXDataset is loaded with current_records=True, the private attribute TIMDEXDataset._dedupe_on_read is set to True, informing any read methods to dedupe during yielding. Because all read methods use TIMDEXDataset.read_batches_iter() at the lowest level, the deduping logic is required only there. Because the ordering of the parquet files is already handled by the load method, the read methods can be confident they are always seeing the most recent version of a record first, and thus can just maintain a "seen" set of record ids as they are encountered. This keeps the deduplication effectively instant and memory safe; no large in-memory reordering or deduplication is required. Side effects of this change: * Applications like TIM now have the option of yielding only current records for a source, or all sources, supporting new functionality like fully reindexing a source in Opensearch from parquet dataset data alone. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-494
1 parent 50fff12 commit 7e0e795

3 files changed

Lines changed: 110 additions & 6 deletions

File tree

tests/conftest.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ def dataset_with_runs_location(tmp_path) -> str:
138138
num_records, source, run_date, run_type, action, run_id = params
139139
records = generate_sample_records(
140140
num_records,
141+
timdex_record_id_prefix=source,
141142
source=source,
142143
run_date=run_date,
143144
run_type=run_type,
@@ -147,3 +148,8 @@ def dataset_with_runs_location(tmp_path) -> str:
147148
timdex_dataset.write(records)
148149

149150
return location
151+
152+
153+
@pytest.fixture
def local_dataset_with_runs(dataset_with_runs_location) -> TIMDEXDataset:
    """Provide an unloaded TIMDEXDataset pointed at the multi-run fixture dataset."""
    timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
    return timdex_dataset

tests/test_dataset.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,3 +339,57 @@ def test_dataset_local_dataset_row_count_missing_dataset_raise_error(local_datas
339339
td = TIMDEXDataset(location="path/to/nowhere")
340340
with pytest.raises(DatasetNotLoadedError):
341341
_ = td.row_count
342+
343+
344+
def test_dataset_all_records_not_current_and_not_deduped(local_dataset_with_runs):
    """Without current_records, every record version is returned, no deduping."""
    local_dataset_with_runs.load()
    records_df = local_dataset_with_runs.read_dataframe()

    # counts reflect every record version written across all runs
    expected_counts = {"alma": 254, "dspace": 194}
    assert records_df.source.value_counts().to_dict() == expected_counts

    # run_date spans the min/max of all runs in the dataset
    assert records_df.run_date.min() == date(2024, 12, 1)
    assert records_df.run_date.max() == date(2025, 2, 5)
354+
355+
356+
def test_dataset_all_current_records_deduped(local_dataset_with_runs):
    """With current_records=True, only the newest version of each record remains."""
    local_dataset_with_runs.load(current_records=True)
    current_df = local_dataset_with_runs.read_dataframe()

    # each source reports only its current record count
    assert current_df.source.value_counts().to_dict() == {"dspace": 90, "alma": 100}

    # exactly one "full" run survives per source (two sources -> two run_ids)
    full_runs_df = current_df[current_df.run_type == "full"]
    assert len(full_runs_df.run_id.unique()) == 2

    # run_date bounds match the two sources' current runs
    assert current_df.run_date.min() == date(2025, 1, 1)  # both
    assert current_df.run_date.max() == date(2025, 2, 5)  # dspace
369+
370+
371+
def test_dataset_source_current_records_deduped(local_dataset_with_runs):
    """current_records=True combined with a source filter dedupes that source only."""
    local_dataset_with_runs.load(current_records=True, source="alma")
    alma_df = local_dataset_with_runs.read_dataframe()

    # only alma records are present, with the expected current-record count
    assert alma_df.source.value_counts().to_dict() == {"alma": 100}

    # a single "full" run survives the filtering
    full_runs_df = alma_df[alma_df.run_type == "full"]
    assert len(full_runs_df.run_id.unique()) == 1

    # run_date bounds are correct for this single source
    assert alma_df.run_date.min() == date(2025, 1, 1)
    assert alma_df.run_date.max() == date(2025, 1, 5)
384+
385+
386+
def test_dataset_all_read_methods_get_deduplication(
    local_dataset_with_runs,
):
    """All read methods funnel through the same deduped batch iterator."""
    local_dataset_with_runs.load(current_records=True, source="alma")

    # collect record counts via each public read method
    dataframe_count = len(local_dataset_with_runs.read_dataframe())
    dict_count = len(list(local_dataset_with_runs.read_dicts_iter()))
    transformed_count = len(
        list(local_dataset_with_runs.read_transformed_records_iter())
    )

    assert dataframe_count == dict_count == transformed_count

timdex_dataset_api/dataset.py

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ def __init__(
120120
self.schema = TIMDEX_DATASET_SCHEMA
121121
self.partition_columns = TIMDEX_DATASET_PARTITION_COLUMNS
122122
self._written_files: list[ds.WrittenFile] = None # type: ignore[assignment]
123+
self._dedupe_on_read: bool = False
123124

124125
@property
125126
def row_count(self) -> int:
@@ -162,6 +163,7 @@ def load(
162163
self._load_pyarrow_dataset()
163164

164165
# if current_records flag set, limit to parquet files associated with current runs
166+
self._dedupe_on_read = current_records
165167
if current_records:
166168
timdex_run_manager = TIMDEXRunManager(timdex_dataset=self)
167169

@@ -467,14 +469,55 @@ def read_batches_iter(
467469
"Dataset is not loaded. Please call the `load` method first."
468470
)
469471
dataset = self._get_filtered_dataset(**filters)
470-
for batch in dataset.to_batches(
472+
473+
batches = dataset.to_batches(
471474
columns=columns,
472475
batch_size=self.config.read_batch_size,
473476
batch_readahead=self.config.batch_read_ahead,
474477
fragment_readahead=self.config.fragment_read_ahead,
475-
):
478+
)
479+
480+
if self._dedupe_on_read:
481+
yield from self._yield_deduped_batches(batches)
482+
else:
483+
for batch in batches:
484+
if len(batch) > 0:
485+
yield batch
486+
487+
def _yield_deduped_batches(
    self, batches: Iterator[pa.RecordBatch]
) -> Iterator[pa.RecordBatch]:
    """Yield batches with previously seen records removed.

    Extending the normal behavior of yielding batches untouched, this method keeps
    track of seen timdex_record_id's, yielding each record only once. For this
    method to yield the most current version of a record -- the most common usage --
    it is required that the batches are pre-ordered so the most recent record
    version is encountered first.

    Args:
        batches: iterator of pyarrow RecordBatches, pre-ordered most-recent-first.

    Yields:
        Non-empty RecordBatches containing only first-encountered record ids.
    """
    seen_records: set = set()
    for batch in batches:
        # skip empty batches outright
        if len(batch) == 0:
            continue

        timdex_ids = batch.column("timdex_record_id").to_pylist()

        # collect indices of record ids not yet seen; adding to the seen set
        # inside this loop also removes duplicates within a single batch
        unseen_batch_indices = []
        for i, record_id in enumerate(timdex_ids):
            if record_id not in seen_records:
                unseen_batch_indices.append(i)
                seen_records.add(record_id)

        # every record in this batch was already yielded by an earlier batch
        if not unseen_batch_indices:
            continue

        # take() with a non-empty index list always returns a non-empty batch,
        # so no additional length check is needed before yielding
        yield batch.take(pa.array(unseen_batch_indices))  # type: ignore[arg-type]
478521

479522
def read_dataframes_iter(
480523
self,
@@ -536,13 +579,14 @@ def read_transformed_records_iter(
536579
) -> Iterator[dict]:
537580
"""Yield individual transformed records as dictionaries from the dataset.
538581
539-
If 'transformed_record' is None (i.e., action="skip"|"error"), the yield
540-
statement will not be executed for the row.
582+
If 'transformed_record' is None (common scenarios are action="skip"|"error"), the
583+
yield statement will not be executed for the row. Note that for action="delete" a
584+
transformed record still may be yielded if present.
541585
542586
Args: see self.read_batches_iter()
543587
"""
544588
for record_dict in self.read_dicts_iter(
545-
columns=["transformed_record"],
589+
columns=["timdex_record_id", "transformed_record"],
546590
**filters,
547591
):
548592
if transformed_record := record_dict["transformed_record"]:

0 commit comments

Comments
 (0)