Merge pull request #144 from MITLibraries/TIMX-494-source-current-runs-and-records

ghukill · web-flow · commit 00b8d2a6ae4b · 2025-05-23T15:37:05.000-04:00
TIMX 494 - yield deduped, most recent records
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -138,6 +138,7 @@ def dataset_with_runs_location(tmp_path) -> str:
         num_records, source, run_date, run_type, action, run_id = params
         records = generate_sample_records(
             num_records,
+            timdex_record_id_prefix=source,
             source=source,
             run_date=run_date,
             run_type=run_type,
@@ -147,3 +148,8 @@ def dataset_with_runs_location(tmp_path) -> str:
         timdex_dataset.write(records)
 
     return location
+
+
+@pytest.fixture
+def local_dataset_with_runs(dataset_with_runs_location) -> TIMDEXDataset:
+    return TIMDEXDataset(dataset_with_runs_location)
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -24,7 +24,7 @@
 def test_dataset_init_success(location, expected_file_system, expected_source):
     timdex_dataset = TIMDEXDataset(location=location)
     assert isinstance(timdex_dataset.filesystem, expected_file_system)
-    assert timdex_dataset.source == expected_source
+    assert timdex_dataset.paths == expected_source
 
 
 def test_dataset_init_env_vars_set_config(monkeypatch, local_dataset_location):
@@ -79,8 +79,7 @@ def test_dataset_load_s3_sets_filesystem_and_dataset_success(
     timdex_dataset = TIMDEXDataset(location="s3://bucket/path/to/dataset")
     result = timdex_dataset.load()
 
-    mock_get_s3_fs.assert_called_once()
-    mock_pyarrow_ds.assert_called_once_with(
+    mock_pyarrow_ds.assert_called_with(
         "bucket/path/to/dataset",
         schema=timdex_dataset.schema,
         format="parquet",
@@ -137,6 +136,26 @@ def test_dataset_load_with_multi_nonpartition_filters_success(fixed_local_datase
     assert fixed_local_dataset.row_count == 1
 
 
+def test_dataset_load_current_records_all_sources_success(dataset_with_runs_location):
+    timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
+
+    # 16 total parquet files, with current_records=False we get them all
+    timdex_dataset.load(current_records=False)
+    assert len(timdex_dataset.dataset.files) == 16
+
+    # 16 total parquet files, with current_records=True we only get 12 for current runs
+    timdex_dataset.load(current_records=True)
+    assert len(timdex_dataset.dataset.files) == 12
+
+
+def test_dataset_load_current_records_one_source_success(dataset_with_runs_location):
+    timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
+    timdex_dataset.load(current_records=True, source="alma")
+
+    # 7 total parquet files for source, only 6 related to current runs
+    assert len(timdex_dataset.dataset.files) == 6
+
+
 def test_dataset_get_filtered_dataset_with_single_nonpartition_success(
     fixed_local_dataset,
 ):
@@ -324,3 +343,57 @@ def test_dataset_local_dataset_row_count_missing_dataset_raise_error(local_datas
     td = TIMDEXDataset(location="path/to/nowhere")
     with pytest.raises(DatasetNotLoadedError):
         _ = td.row_count
+
+
+def test_dataset_all_records_not_current_and_not_deduped(local_dataset_with_runs):
+    local_dataset_with_runs.load()
+    all_records_df = local_dataset_with_runs.read_dataframe()
+
+    # assert counts reflect all records from dataset, no deduping
+    assert all_records_df.source.value_counts().to_dict() == {"alma": 254, "dspace": 194}
+
+    # assert run_date min/max dates align with min/max for all runs
+    assert all_records_df.run_date.min() == date(2024, 12, 1)
+    assert all_records_df.run_date.max() == date(2025, 2, 5)
+
+
+def test_dataset_all_current_records_deduped(local_dataset_with_runs):
+    local_dataset_with_runs.load(current_records=True)
+    all_records_df = local_dataset_with_runs.read_dataframe()
+
+    # assert both sources have accurate record counts for current records only
+    assert all_records_df.source.value_counts().to_dict() == {"dspace": 90, "alma": 100}
+
+    # assert only one "full" run, per source
+    assert len(all_records_df[all_records_df.run_type == "full"].run_id.unique()) == 2
+
+    # assert run_date min/max dates align with both sources min/max dates
+    assert all_records_df.run_date.min() == date(2025, 1, 1)  # both
+    assert all_records_df.run_date.max() == date(2025, 2, 5)  # dspace
+
+
+def test_dataset_source_current_records_deduped(local_dataset_with_runs):
+    local_dataset_with_runs.load(current_records=True, source="alma")
+    alma_records_df = local_dataset_with_runs.read_dataframe()
+
+    # assert only alma records present and correct count
+    assert alma_records_df.source.value_counts().to_dict() == {"alma": 100}
+
+    # assert only one "full" run
+    assert len(alma_records_df[alma_records_df.run_type == "full"].run_id.unique()) == 1
+
+    # assert run_date min/max dates are correct for single source
+    assert alma_records_df.run_date.min() == date(2025, 1, 1)
+    assert alma_records_df.run_date.max() == date(2025, 1, 5)
+
+
+def test_dataset_all_read_methods_get_deduplication(
+    local_dataset_with_runs,
+):
+    local_dataset_with_runs.load(current_records=True, source="alma")
+
+    full_df = local_dataset_with_runs.read_dataframe()
+    all_records = list(local_dataset_with_runs.read_dicts_iter())
+    transformed_records = list(local_dataset_with_runs.read_transformed_records_iter())
+
+    assert len(full_df) == len(all_records) == len(transformed_records)
diff --git a/tests/test_runs.py b/tests/test_runs.py
@@ -56,14 +56,27 @@ def test_timdex_run_manager_get_runs_df(timdex_run_manager):
     assert runs_df.source.value_counts().to_dict() == {"alma": 7, "dspace": 7}
 
 
+def test_timdex_run_manager_get_all_current_run_parquet_files_success(
+    timdex_run_manager,
+):
+    ordered_parquet_files = timdex_run_manager.get_current_parquet_files()
+
+    # assert 12 parquet files, despite being 14 total for ALL sources
+    # this represents the last full run and all daily since
+    assert len(ordered_parquet_files) == 12
+
+    # assert reverse chronological order per source: the last file has the earliest date
+    assert "year=2025/month=01/day=01" in ordered_parquet_files[-1]
+
+
 def test_timdex_run_manager_get_source_current_run_parquet_files_success(
     timdex_run_manager,
 ):
-    ordered_parquet_files = timdex_run_manager.get_current_source_parquet_files("alma")
+    ordered_parquet_files = timdex_run_manager._get_current_source_parquet_files("alma")
 
-    # assert 6 parquet files, despite being 8 total for alma
+    # assert 6 parquet files, despite being 8 total for 'alma' source
     # this represents the last full run and all daily since
-    assert len(ordered_parquet_files)
+    assert len(ordered_parquet_files) == 6
 
     # assert sorted reverse chronologically
     assert "year=2025/month=01/day=05" in ordered_parquet_files[0]
diff --git a/timdex_dataset_api/__init__.py b/timdex_dataset_api/__init__.py
@@ -3,7 +3,7 @@
 from timdex_dataset_api.dataset import TIMDEXDataset
 from timdex_dataset_api.record import DatasetRecord
 
-__version__ = "1.0.0"
+__version__ = "2.0.0"
 
 __all__ = [
     "DatasetRecord",
diff --git a/timdex_dataset_api/dataset.py b/timdex_dataset_api/dataset.py
@@ -20,6 +20,7 @@
 
 from timdex_dataset_api.config import configure_logger
 from timdex_dataset_api.exceptions import DatasetNotLoadedError
+from timdex_dataset_api.run import TIMDEXRunManager
 
 if TYPE_CHECKING:
     from timdex_dataset_api.record import DatasetRecord  # pragma: nocover
@@ -114,11 +115,12 @@ def __init__(
         self.location = location
         self.config = config or TIMDEXDatasetConfig()
 
-        self.filesystem, self.source = self.parse_location(self.location)
+        self.filesystem, self.paths = self.parse_location(self.location)
         self.dataset: ds.Dataset = None  # type: ignore[assignment]
         self.schema = TIMDEX_DATASET_SCHEMA
         self.partition_columns = TIMDEX_DATASET_PARTITION_COLUMNS
         self._written_files: list[ds.WrittenFile] = None  # type: ignore[assignment]
+        self._dedupe_on_read: bool = False
 
     @property
     def row_count(self) -> int:
@@ -129,6 +131,8 @@ def row_count(self) -> int:
 
     def load(
         self,
+        *,
+        current_records: bool = False,
         **filters: Unpack[DatasetFilters],
     ) -> None:
         """Lazy load a pyarrow.dataset.Dataset and set to self.dataset.
@@ -152,14 +156,24 @@ def load(
         """
         start_time = time.perf_counter()
 
-        # load dataset
-        self.dataset = ds.dataset(
-            self.source,
-            schema=self.schema,
-            format="parquet",
-            partitioning="hive",
-            filesystem=self.filesystem,
-        )
+        # reset paths from original location before load
+        _, self.paths = self.parse_location(self.location)
+
+        # perform initial load of full dataset
+        self._load_pyarrow_dataset()
+
+        # if current_records flag set, limit to parquet files associated with current runs
+        self._dedupe_on_read = current_records
+        if current_records:
+            timdex_run_manager = TIMDEXRunManager(timdex_dataset=self)
+
+            # update paths, limiting by source if set
+            self.paths = timdex_run_manager.get_current_parquet_files(
+                source=filters.get("source")
+            )
+
+            # reload pyarrow dataset
+            self._load_pyarrow_dataset()
 
         # filter dataset
         self.dataset = self._get_filtered_dataset(**filters)
@@ -169,6 +183,16 @@ def load(
             f"{round(time.perf_counter()-start_time, 2)}s"
         )
 
+    def _load_pyarrow_dataset(self) -> None:
+        """Load self.dataset from self.paths using self.filesystem."""
+        self.dataset = ds.dataset(
+            self.paths,
+            schema=self.schema,
+            format="parquet",
+            partitioning="hive",
+            filesystem=self.filesystem,
+        )
+
     def _get_filtered_dataset(
         self,
         **filters: Unpack[DatasetFilters],
@@ -345,7 +369,8 @@ def write(
         start_time = time.perf_counter()
         self._written_files = []
 
-        if isinstance(self.source, list):
+        dataset_filesystem, dataset_path = self.parse_location(self.location)
+        if isinstance(dataset_path, list):
             raise TypeError(
                 "Dataset location must be the root of a single dataset for writing"
             )
@@ -354,10 +379,10 @@ def write(
 
         ds.write_dataset(
             record_batches_iter,
-            base_dir=self.source,
+            base_dir=dataset_path,
             basename_template="%s-{i}.parquet" % (str(uuid.uuid4())),  # noqa: UP031
             existing_data_behavior="overwrite_or_ignore",
-            filesystem=self.filesystem,
+            filesystem=dataset_filesystem,
             file_visitor=lambda written_file: self._written_files.append(written_file),  # type: ignore[arg-type]
             format="parquet",
             max_open_files=500,
@@ -444,14 +469,55 @@ def read_batches_iter(
                 "Dataset is not loaded. Please call the `load` method first."
             )
         dataset = self._get_filtered_dataset(**filters)
-        for batch in dataset.to_batches(
+
+        batches = dataset.to_batches(
             columns=columns,
             batch_size=self.config.read_batch_size,
             batch_readahead=self.config.batch_read_ahead,
             fragment_readahead=self.config.fragment_read_ahead,
-        ):
+        )
+
+        if self._dedupe_on_read:
+            yield from self._yield_deduped_batches(batches)
+        else:
+            for batch in batches:
+                if len(batch) > 0:
+                    yield batch
+
+    def _yield_deduped_batches(
+        self, batches: Iterator[pa.RecordBatch]
+    ) -> Iterator[pa.RecordBatch]:
+        """Yield record batches deduplicated by timdex_record_id.
+
+        Extending the normal behavior of yielding batches untouched, this method keeps
+        track of timdex_record_id values already seen, yielding each record only once.
+        For this method to yield the most current version of a record -- most common
+        usage -- it is required that the batches are pre-ordered so the most recent
+        record version is encountered first.
+        """
+        seen_records = set()
+        for batch in batches:
             if len(batch) > 0:
-                yield batch
+                # init list of batch indices for records unseen
+                unseen_batch_indices = []
+
+                # get list of timdex ids from batch
+                timdex_ids = batch.column("timdex_record_id").to_pylist()
+
+                # check each record id and track unseen ones
+                for i, record_id in enumerate(timdex_ids):
+                    if record_id not in seen_records:
+                        unseen_batch_indices.append(i)
+                        seen_records.add(record_id)
+
+                # if all records from batch were seen, continue
+                if not unseen_batch_indices:
+                    continue
+
+                # else, yield unseen records from batch
+                deduped_batch = batch.take(pa.array(unseen_batch_indices))  # type: ignore[arg-type]
+                if len(deduped_batch) > 0:
+                    yield deduped_batch
 
     def read_dataframes_iter(
         self,
@@ -513,13 +579,14 @@ def read_transformed_records_iter(
     ) -> Iterator[dict]:
         """Yield individual transformed records as dictionaries from the dataset.
 
-        If 'transformed_record' is None (i.e., action="skip"|"error"), the yield
-        statement will not be executed for the row.
+        If 'transformed_record' is None (common scenarios are action="skip"|"error"), the
+        yield statement will not be executed for the row.  Note that for action="delete" a
+        transformed record still may be yielded if present.
 
         Args: see self.read_batches_iter()
         """
         for record_dict in self.read_dicts_iter(
-            columns=["transformed_record"],
+            columns=["timdex_record_id", "transformed_record"],
             **filters,
         ):
             if transformed_record := record_dict["transformed_record"]:
diff --git a/timdex_dataset_api/run.py b/timdex_dataset_api/run.py
@@ -85,7 +85,22 @@ def get_runs_metadata(self, *, refresh: bool = False) -> pd.DataFrame:
         )
         return grouped_runs_df
 
-    def get_current_source_parquet_files(self, source: str) -> list[str]:
+    def get_current_parquet_files(self, source: str | None = None) -> list[str]:
+        """Get parquet files for current runs, reverse chronological within each source.
+
+        Args:
+            source: if provided, limits parquet files to only that source
+        """
+        runs_df = self.get_runs_metadata()  # run metadata is cached for future calls
+        sources = [source] if source else list(runs_df.source.unique())
+
+        source_parquet_files = []
+        for _source in sources:
+            source_parquet_files.extend(self._get_current_source_parquet_files(_source))
+
+        return source_parquet_files
+
+    def _get_current_source_parquet_files(self, source: str) -> list[str]:
         """Get reverse chronological list of current parquet files for a source.
 
         Args:
@@ -166,8 +181,9 @@ def _parse_run_metadata_from_parquet_file(self, parquet_filepath: str) -> dict:
         """
         parquet_file = pq.ParquetFile(
             parquet_filepath,
-            filesystem=self.timdex_dataset.filesystem,  # type: ignore[union-attr]
+            filesystem=self.timdex_dataset.filesystem,
         )
+
         file_meta = parquet_file.metadata.to_dict()
         num_rows = file_meta["num_rows"]
         columns_meta = file_meta["row_groups"][0]["columns"]  # type: ignore[typeddict-item]