Skip to content

Commit 3e5e707

Browse files
Remove option to load dataset using partition prefix
Why these changes are being introduced: * When a partition prefix is constructed for the dataset with year, month, and day (or a combination of them), and appended to the 'source' arg in ds.dataset, the resulting dataset will contain 'None' values for any partition columns used in the prefix. It is then problematic with the "post-load" filtering step, which again attempts to filter on those partition columns. It was believed that this would be somewhat inconsequential, but additional testing revealed this was not the case. These changes simplify the TIMDEXDataset.load method by instead relying on PyArrow's efficient dataset discovery and reading processes. For more details, see comment on PR #31: #31 (review). How this addresses that need: * Remove '_get_partition_prefixes' private method * Update '_parse_date_filters' to raise TypeError * Update unit tests Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-425
1 parent 3daca7b commit 3e5e707

3 files changed

Lines changed: 80 additions & 161 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ classifiers = [
2222
]
2323

2424
dependencies = [
25+
"attrs",
2526
"boto3",
2627
"duckdb",
2728
"pandas",

tests/test_dataset.py

Lines changed: 41 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def test_dataset_load_local_sets_filesystem_and_dataset_success(
3535
result = timdex_dataset.load()
3636

3737
mock_pyarrow_ds.assert_called_once_with(
38-
"local/path/to/dataset/",
38+
"local/path/to/dataset",
3939
schema=timdex_dataset.schema,
4040
format="parquet",
4141
partitioning="hive",
@@ -59,7 +59,7 @@ def test_dataset_load_s3_sets_filesystem_and_dataset_success(
5959

6060
mock_get_s3_fs.assert_called_once()
6161
mock_pyarrow_ds.assert_called_once_with(
62-
"bucket/path/to/dataset/",
62+
"bucket/path/to/dataset",
6363
schema=timdex_dataset.schema,
6464
format="parquet",
6565
partitioning="hive",
@@ -69,60 +69,55 @@ def test_dataset_load_s3_sets_filesystem_and_dataset_success(
6969
assert result is None
7070

7171

72-
@patch("timdex_dataset_api.dataset.fs.LocalFileSystem")
73-
@patch("timdex_dataset_api.dataset.ds.dataset")
74-
def test_dataset_load_with_partition_prefix_via_run_date_success(
75-
mock_pyarrow_ds, mock_local_fs
76-
):
77-
mock_local_fs.return_value = MagicMock()
78-
mock_pyarrow_ds.return_value = MagicMock()
72+
def test_dataset_load_without_filters_success(fixed_local_dataset):
73+
fixed_local_dataset.load()
7974

80-
timdex_dataset = TIMDEXDataset(location="local/path/to/dataset")
81-
timdex_dataset.load(run_date="2024-12-01")
75+
assert os.path.exists(fixed_local_dataset.location)
76+
assert fixed_local_dataset.row_count == 5_000 # noqa: PLR2004
8277

83-
mock_pyarrow_ds.assert_called_once_with(
84-
"local/path/to/dataset/year=2024/month=12/day=01",
85-
schema=timdex_dataset.schema,
86-
format="parquet",
87-
partitioning="hive",
88-
filesystem=mock_local_fs.return_value,
89-
)
9078

79+
def test_dataset_load_with_run_date_str_filters_success(fixed_local_dataset):
80+
fixed_local_dataset.load(run_date="2024-12-01")
9181

92-
@patch("timdex_dataset_api.dataset.fs.LocalFileSystem")
93-
@patch("timdex_dataset_api.dataset.ds.dataset")
94-
def test_dataset_load_with_partition_prefix_via_run_date_components_success(
95-
mock_pyarrow_ds, mock_local_fs
96-
):
97-
mock_local_fs.return_value = MagicMock()
98-
mock_pyarrow_ds.return_value = MagicMock()
82+
assert os.path.exists(fixed_local_dataset.location)
83+
assert fixed_local_dataset.row_count == 5_000 # noqa: PLR2004
9984

100-
timdex_dataset = TIMDEXDataset(location="local/path/to/dataset")
101-
timdex_dataset.load(year="2024")
10285

103-
mock_pyarrow_ds.assert_called_once_with(
104-
"local/path/to/dataset/year=2024",
105-
schema=timdex_dataset.schema,
106-
format="parquet",
107-
partitioning="hive",
108-
filesystem=mock_local_fs.return_value,
109-
)
86+
def test_dataset_load_with_run_date_obj_filters_success(fixed_local_dataset):
87+
fixed_local_dataset.load(run_date=date(2024, 12, 1))
88+
89+
assert os.path.exists(fixed_local_dataset.location)
90+
assert fixed_local_dataset.row_count == 5_000 # noqa: PLR2004
11091

11192

112-
def test_dataset_load_no_filters_success(fixed_local_dataset):
113-
fixed_local_dataset.load()
93+
def test_dataset_load_with_ymd_filters_success(fixed_local_dataset):
94+
fixed_local_dataset.load(year="2024", month="12", day="01")
11495

11596
assert os.path.exists(fixed_local_dataset.location)
11697
assert fixed_local_dataset.row_count == 5_000 # noqa: PLR2004
11798

11899

119-
def test_dataset_load_and_filter_by_non_partition_field_success(fixed_local_dataset):
100+
def test_dataset_load_with_single_nonpartition_filters_success(fixed_local_dataset):
120101
fixed_local_dataset.load(timdex_record_id="alma:0")
121102

122103
assert fixed_local_dataset.row_count == 1
123104

124105

125-
def test_dataset_get_filtered_dataset_by_all_fields_success(fixed_local_dataset):
106+
def test_dataset_load_with_multi_nonpartition_filters_success(fixed_local_dataset):
107+
fixed_local_dataset.load(
108+
timdex_record_id="alma:0",
109+
source="alma",
110+
run_type="daily",
111+
run_id="abc123",
112+
action="index",
113+
)
114+
115+
assert fixed_local_dataset.row_count == 1
116+
117+
118+
def test_dataset_get_filtered_dataset_with_multi_nonpartition_filters_success(
119+
fixed_local_dataset,
120+
):
126121
fixed_local_dataset.load() # initial load dataset, no filters passed
127122

128123
filtered_local_dataset = fixed_local_dataset._get_filtered_dataset(
@@ -138,7 +133,9 @@ def test_dataset_get_filtered_dataset_by_all_fields_success(fixed_local_dataset)
138133
assert filtered_local_df["timdex_record_id"].iloc[0] == "alma:0"
139134

140135

141-
def test_dataset_get_filtered_dataset_by_single_fields_success(fixed_local_dataset):
136+
def test_dataset_get_filtered_dataset_with_single_nonpartition_success(
137+
fixed_local_dataset,
138+
):
142139
fixed_local_dataset.load() # initial load dataset, no filters passed
143140

144141
filtered_local_dataset = fixed_local_dataset._get_filtered_dataset(
@@ -152,7 +149,7 @@ def test_dataset_get_filtered_dataset_by_single_fields_success(fixed_local_datas
152149
assert filtered_local_df["run_id"].unique() == ["abc123"]
153150

154151

155-
def test_dataset_get_filtered_dataset_by_run_date_str_successs(fixed_local_dataset):
152+
def test_dataset_get_filtered_dataset_with_run_date_str_successs(fixed_local_dataset):
156153
fixed_local_dataset.load() # initial load dataset, no filters passed
157154

158155
filtered_local_dataset = fixed_local_dataset._get_filtered_dataset(
@@ -166,7 +163,7 @@ def test_dataset_get_filtered_dataset_by_run_date_str_successs(fixed_local_datas
166163
assert empty_local_dataset.count_rows() == 0
167164

168165

169-
def test_dataset_get_filtered_dataset_by_run_date_date_success(fixed_local_dataset):
166+
def test_dataset_get_filtered_dataset_with_run_date_obj_success(fixed_local_dataset):
170167
fixed_local_dataset.load() # initial load dataset, no filters passed
171168

172169
filtered_local_dataset = fixed_local_dataset._get_filtered_dataset(
@@ -182,7 +179,7 @@ def test_dataset_get_filtered_dataset_by_run_date_date_success(fixed_local_datas
182179
assert empty_local_dataset.count_rows() == 0
183180

184181

185-
def test_dataset_get_filtered_dataset_by_run_date_components_success(fixed_local_dataset):
182+
def test_dataset_get_filtered_dataset_with_ymd_success(fixed_local_dataset):
186183
fixed_local_dataset.load() # initial load dataset, no filters passed
187184

188185
filtered_local_dataset = fixed_local_dataset._get_filtered_dataset(year="2024")
@@ -194,13 +191,13 @@ def test_dataset_get_filtered_dataset_by_run_date_components_success(fixed_local
194191
assert empty_local_dataset.count_rows() == 0
195192

196193

197-
def test_dataset_get_filtered_dataset_by_run_date_if_invalid_type_raise_error(
194+
def test_dataset_get_filtered_dataset_with_run_date_invalid_raise_error(
198195
fixed_local_dataset,
199196
):
200197
fixed_local_dataset.load() # initial load dataset, no filters passed
201198

202199
with pytest.raises(
203-
ValueError,
200+
TypeError,
204201
match=(
205202
"Provided 'run_date' value must be a string matching format '%Y-%m-%d' "
206203
"or a datetime.date."
@@ -209,36 +206,6 @@ def test_dataset_get_filtered_dataset_by_run_date_if_invalid_type_raise_error(
209206
_ = fixed_local_dataset._get_filtered_dataset(run_date=999)
210207

211208

212-
def test_dataset_get_partition_prefixes_with_run_date_success():
213-
timdex_dataset = TIMDEXDataset(location="s3://bucket/path/to/dataset")
214-
215-
assert (
216-
timdex_dataset._get_partition_prefixes(run_date="2024-12-01")
217-
== "year=2024/month=12/day=01"
218-
)
219-
220-
221-
def test_dataset_get_partition_prefixes_without_run_date_success():
222-
timdex_dataset = TIMDEXDataset(location="s3://bucket/path/to/dataset")
223-
224-
assert (
225-
timdex_dataset._get_partition_prefixes(year="2024", month="12", day="01")
226-
) == "year=2024/month=12/day=01"
227-
assert (
228-
timdex_dataset._get_partition_prefixes(year="2024", month="12")
229-
== "year=2024/month=12"
230-
)
231-
assert timdex_dataset._get_partition_prefixes(year="2024") == "year=2024"
232-
233-
234-
def test_dataset_get_partition_prefixes_without_run_date_raise_error():
235-
timdex_dataset = TIMDEXDataset(location="s3://bucket/path/to/dataset")
236-
with pytest.raises(
237-
ValueError, match="Insufficient arguments to construct a valid partition prefix."
238-
):
239-
assert timdex_dataset._get_partition_prefixes(month="12", day="01")
240-
241-
242209
def test_dataset_get_s3_filesystem_success(mocker):
243210
mocked_s3_filesystem = mocker.spy(fs, "S3FileSystem")
244211
s3_filesystem = TIMDEXDataset.get_s3_filesystem()

timdex_dataset_api/dataset.py

Lines changed: 38 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import itertools
44
import operator
5-
import os
65
import time
76
import uuid
87
from collections.abc import Iterator
@@ -110,9 +109,11 @@ def load(
110109
This method sets a pyarrow.dataset.Dataset to the TIMDEXDataset.dataset
111110
attribute. Loading comprises of two main steps:
112111
113-
- pre load: Append a partition prefix to self.source using either 'run_date'
114-
or 'run_date' components to skip reading unnecessary data partitions.
115-
- post load: Lazily filter TIMDEXDataset.
112+
- load: Lazily load full dataset. PyArrow will "discover" full dataset.
113+
Note: This step may take a couple of seconds but leans on PyArrow's
114+
parquet reading processes.
115+
- filter: Lazily filter rows in the PyArrow dataset by conditions on
116+
TIMDEX_DATASET_FILTER_COLUMNS.
116117
117118
The dataset is loaded via the expected schema as defined by module constant
118119
TIMDEX_DATASET_SCHEMA. If the target dataset differs in any way, errors may be
@@ -127,14 +128,9 @@ def load(
127128
- month (str | None, optional)
128129
- day (str | None, optional)
129130
130-
If 'run_date' is provided, partition prefixes are derived by parsing
131-
'run_date' into its individual components.
132-
- 'run_date' str values must match date format "%Y-%m-%d".
133-
134-
If 'run_date' is not provided, partition prefixes are derived using
135-
provided values for individual 'run_date' components: year, month, day.
136-
See TIMDEXDataset.get_partition_prefix to see accepted combination
137-
of args.
131+
If 'run_date' is provided, partition filters are derived by parsing
132+
a datetime.date object from the 'run_date' value and extracting
133+
ymd values to use in filter expression.
138134
139135
Non-partition columns
140136
- timdex_record_id (str | None, optional)
@@ -148,22 +144,16 @@ def load(
148144
"""
149145
start_time = time.perf_counter()
150146

151-
source_path = self.source
152-
if isinstance(self.source, str):
153-
source_path = os.path.join(
154-
self.source, self._get_partition_prefixes(run_date, year, month, day)
155-
)
156-
157-
# pre load: load dataset lazily, with an optional partition prefix
147+
# lazy load full dataset
158148
self.dataset = ds.dataset(
159-
source_path,
149+
self.source,
160150
schema=self.schema,
161151
format="parquet",
162152
partitioning="hive",
163153
filesystem=self.filesystem,
164154
)
165155

166-
# post load: filter dataset
156+
# filter dataset
167157
self.dataset = self._get_filtered_dataset(
168158
timdex_record_id=timdex_record_id,
169159
source=source,
@@ -252,7 +242,8 @@ def _get_filtered_dataset(
252242
)
253243

254244
# get filters for partition columns ('run_date' or 'run_date' components)
255-
filters_dict.update(self._parse_date_filters(run_date))
245+
if run_date:
246+
filters_dict.update(self._parse_date_filters(run_date))
256247

257248
# create filter expressions for element-wise equality comparisons
258249
expressions = []
@@ -273,78 +264,38 @@ def _get_filtered_dataset(
273264

274265
return self.dataset.filter(combined_expressions)
275266

276-
def _get_partition_prefixes(
277-
self,
278-
run_date: str | date | None = None,
279-
year: str | None = None,
280-
month: str | None = None,
281-
day: str | None = None,
282-
) -> str:
283-
"""Derive partition prefixes from provided 'run_date' or 'run_date' components.
284-
285-
Argument 'run_date' is a date string formatted as "YYYY-MM-DD". If not provided,
286-
the arguments 'year', 'month', and 'date' (also string values) must be provided
287-
in specific combinations:
267+
def _parse_date_filters(self, run_date: str | date | None) -> dict:
268+
"""Parse date filters from 'run_date'.
288269
289-
- year, month, day
290-
- year, month
291-
- year
270+
Args:
271+
run_date (str | date | None): If str, the value must match the
272+
date format "%Y-%m-%d"; if date, ymd values are extracted
273+
as str.
292274
293-
Any other combinations are insufficient to construct a valid partition prefix.
275+
Raises:
276+
TypeError: Raised when 'run_date' is an invalid type.
277+
ValueError: Raised when either a datetime.date object cannot be parsed
278+
from a provided 'run_date' str.
294279
295-
Returns a string of partition prefixes: "year=2024/month=12/day=01".
280+
Returns:
281+
dict: 'run_date' filters.
296282
"""
297-
if run_date:
298-
run_date_filters = self._parse_date_filters(run_date)
299-
return (
300-
f"year={run_date_filters["year"]}/"
301-
f"month={run_date_filters["month"]}/"
302-
f"day={run_date_filters["day"]}"
303-
)
304-
305-
partition_prefixes = []
306-
if year and month and day:
307-
partition_prefixes.extend([year, month, day])
308-
elif year and month and day is None:
309-
partition_prefixes.extend([year, month])
310-
elif year and month is None and day is None:
311-
partition_prefixes.extend([year])
312-
elif year is None and month is None and day is None:
313-
return ""
283+
if isinstance(run_date, str):
284+
run_date_obj = strict_date_parse(run_date)
285+
elif isinstance(run_date, date):
286+
run_date_obj = run_date
314287
else:
315-
raise ValueError(
316-
"Insufficient arguments to construct a valid partition prefix."
288+
raise TypeError(
289+
"Provided 'run_date' value must be a string matching format "
290+
"'%Y-%m-%d' or a datetime.date."
317291
)
318292

319-
partition_prefixes_dict = dict(
320-
zip(TIMDEX_DATASET_PARTITION_COLUMNS, partition_prefixes, strict=False),
321-
)
322-
return "/".join(
323-
f"{partition_column}={partition_value}"
324-
for partition_column, partition_value in partition_prefixes_dict.items()
325-
)
326-
327-
def _parse_date_filters(self, run_date: str | date | None) -> dict:
328-
date_filters = {}
329-
if run_date is not None:
330-
if isinstance(run_date, str):
331-
run_date_obj = strict_date_parse(run_date)
332-
elif isinstance(run_date, date):
333-
run_date_obj = run_date
334-
else:
335-
raise ValueError(
336-
"Provided 'run_date' value must be a string matching format "
337-
"'%Y-%m-%d' or a datetime.date."
338-
)
339-
date_filters.update(
340-
{
341-
"run_date": run_date_obj,
342-
"year": run_date_obj.strftime("%Y"),
343-
"month": run_date_obj.strftime("%m"),
344-
"day": run_date_obj.strftime("%d"),
345-
}
346-
)
347-
return date_filters
293+
return {
294+
"run_date": run_date_obj,
295+
"year": run_date_obj.strftime("%Y"),
296+
"month": run_date_obj.strftime("%m"),
297+
"day": run_date_obj.strftime("%d"),
298+
}
348299

349300
@staticmethod
350301
def get_s3_filesystem() -> fs.FileSystem:

0 commit comments

Comments
 (0)