Skip to content

Commit ff2aff0

Browse files
committed
Remove current records functionality in TIMDEXDataset
Why these changes are being introduced: While the TIMDEXDatasetMetadata class is rebuilt, TIMDEXDataset itself can no longer provide "current" records from the dataset as it has no metadata to work with. This is temporary until TIMDEXDatasetMetadata is rebuilt, and TIMDEXDataset gets new functionality based on *that* new metadata. How this addresses that need: * Any reference to "current records" is removed Side effects of this change: * TIMDEXDataset cannot provide current records Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-530
1 parent 2fd2218 commit ff2aff0

2 files changed

Lines changed: 11 additions & 80 deletions

File tree

migrations/002_2025_06_25_consistent_run_timestamp_per_etl_run.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
# ruff: noqa: BLE001, D212, TRY300, TRY400
1+
# ruff: noqa: PGH004
2+
# ruff: noqa
3+
# type: ignore
4+
25
"""
36
Date: 2025-06-25
47
@@ -29,6 +32,10 @@
2932
pipenv run python migrations/002_2025_06_25_consistent_run_timestamp_per_etl_run.py \
3033
<DATASET_LOCATION> \
3134
--dry-run
35+
36+
Update: 2025-08-04
37+
38+
This migration is no longer functional given changes to TIMDEXDataset.
3239
"""
3340

3441
import argparse

timdex_dataset_api/dataset.py

Lines changed: 3 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,11 @@
2020

2121
from timdex_dataset_api.config import configure_logger
2222
from timdex_dataset_api.exceptions import DatasetNotLoadedError
23-
from timdex_dataset_api.metadata import TIMDEXDatasetMetadata
2423

2524
if TYPE_CHECKING:
2625
from timdex_dataset_api.record import DatasetRecord # pragma: nocover
2726

27+
2828
logger = configure_logger(__name__)
2929

3030
TIMDEX_DATASET_SCHEMA = pa.schema(
@@ -126,10 +126,6 @@ def __init__(
126126
# writing
127127
self._written_files: list[ds.WrittenFile] = None # type: ignore[assignment]
128128

129-
# reading
130-
self._current_records: bool = False
131-
self.metadata: TIMDEXDatasetMetadata = None # type: ignore[assignment]
132-
133129
@property
134130
def row_count(self) -> int:
135131
"""Get row count from loaded dataset."""
@@ -139,8 +135,6 @@ def row_count(self) -> int:
139135

140136
def load(
141137
self,
142-
*,
143-
current_records: bool = False,
144138
**filters: Unpack[DatasetFilters],
145139
) -> None:
146140
"""Lazy load a pyarrow.dataset.Dataset and set to self.dataset.
@@ -161,21 +155,12 @@ def load(
161155
- filters: kwargs typed via DatasetFilters TypedDict
162156
- Filters passed directly in method call, e.g. source="alma",
163157
run_date="2024-12-20", etc., but are typed according to DatasetFilters.
164-
- current_records: bool
165-
- if True, all records yielded from this instance will be the current
166-
version of the record in the dataset.
167158
"""
168159
start_time = time.perf_counter()
169160

170161
# reset paths from original location before load
171162
_, self.paths = self.parse_location(self.location)
172163

173-
# read dataset metadata if only current records are requested
174-
self._current_records = current_records
175-
if current_records:
176-
self.metadata = TIMDEXDatasetMetadata(timdex_dataset=self)
177-
self.paths = self.metadata.get_current_parquet_files(**filters)
178-
179164
# perform initial load of full dataset
180165
self.dataset = self._load_pyarrow_dataset()
181166

@@ -465,10 +450,6 @@ def read_batches_iter(
465450
While batch_size will limit the max rows per batch, filtering may result in some
466451
batches having less than this limit.
467452
468-
If the flag self._current_records is set, this method leans on
469-
self._yield_current_record_deduped_batches() to apply deduplication of records to
470-
ensure only current versions of the record are ever yielded.
471-
472453
Args:
473454
- columns: list[str], list of columns to return from the dataset
474455
- filters: pairs of column:value to filter the dataset
@@ -479,73 +460,16 @@ def read_batches_iter(
479460
)
480461
dataset = self._get_filtered_dataset(**filters)
481462

482-
# if current records, add required columns for deduplication
483-
if self._current_records:
484-
if not columns:
485-
columns = TIMDEX_DATASET_SCHEMA.names
486-
columns.extend(["timdex_record_id", "run_id"])
487-
columns = list(set(columns))
488-
489463
batches = dataset.to_batches(
490464
columns=columns,
491465
batch_size=self.config.read_batch_size,
492466
batch_readahead=self.config.batch_read_ahead,
493467
fragment_readahead=self.config.fragment_read_ahead,
494468
)
495469

496-
if self._current_records:
497-
yield from self._yield_current_record_batches(batches, **filters)
498-
else:
499-
for batch in batches:
500-
if len(batch) > 0:
501-
yield batch
502-
503-
def _yield_current_record_batches(
504-
self,
505-
batches: Iterator[pa.RecordBatch],
506-
**filters: Unpack[DatasetFilters],
507-
) -> Iterator[pa.RecordBatch]:
508-
"""Method to yield only the most recent version of each record.
509-
510-
When multiple versions of a record (same timdex_record_id) exist in the dataset,
511-
this method ensures only the most recent version is returned. If filtering is
512-
applied that removes this most recent version of a record, that timdex_record_id
513-
will not be yielded at all.
514-
515-
This method uses TIMDEXDatasetMetadata to provide a mapping of timdex_record_id to
516-
run_id for the current ETL run for that record. While yielding records, only when
517-
the timdex_record_id + run_id match the mapping is a record yielded.
518-
519-
Args:
520-
- batches: batches of records to actually yield from
521-
- filters: pairs of column:value to filter the dataset metadata required
522-
"""
523-
# get map of timdex_record_id to run_id for current version of that record
524-
record_to_run_map = self.metadata.get_current_record_to_run_map(**filters)
525-
526-
# loop through batches, yielding only current records
527470
for batch in batches:
528-
529-
if batch.num_rows == 0:
530-
continue
531-
532-
to_yield_indices = []
533-
534-
record_ids = batch.column("timdex_record_id").to_pylist()
535-
run_ids = batch.column("run_id").to_pylist()
536-
537-
for i, (record_id, run_id) in enumerate(
538-
zip(
539-
record_ids,
540-
run_ids,
541-
strict=True,
542-
)
543-
):
544-
if record_to_run_map.get(record_id) == run_id:
545-
to_yield_indices.append(i)
546-
547-
if to_yield_indices:
548-
yield batch.take(pa.array(to_yield_indices)) # type: ignore[arg-type]
471+
if len(batch) > 0:
472+
yield batch
549473

550474
def read_dataframes_iter(
551475
self,

0 commit comments

Comments
 (0)