Skip to content

Commit bd408dc

Browse files
authored
Merge pull request #108 from MITLibraries/TIMX-468-read-configs
TIMX 468 - surface pyarrow read configs to address memory footprint
2 parents 41b326f + 3d5d57b commit bd408dc

4 files changed

Lines changed: 110 additions & 48 deletions

File tree

tests/test_dataset.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# ruff: noqa: S105, S106, SLF001
1+
# ruff: noqa: S105, S106, SLF001, PLR2004
22
import os
33
from datetime import date
44
from unittest.mock import MagicMock, patch
@@ -7,7 +7,11 @@
77
import pytest
88
from pyarrow import fs
99

10-
from timdex_dataset_api.dataset import DatasetNotLoadedError, TIMDEXDataset
10+
from timdex_dataset_api.dataset import (
11+
DatasetNotLoadedError,
12+
TIMDEXDataset,
13+
TIMDEXDatasetConfig,
14+
)
1115

1216

1317
@pytest.mark.parametrize(
@@ -23,6 +27,24 @@ def test_dataset_init_success(location, expected_file_system, expected_source):
2327
assert timdex_dataset.source == expected_source
2428

2529

30+
def test_dataset_init_env_vars_set_config(monkeypatch, local_dataset_location):
31+
default_timdex_dataset = TIMDEXDataset(location=local_dataset_location)
32+
default_read_batch_config = default_timdex_dataset.config.read_batch_size
33+
assert default_read_batch_config == 1_000
34+
35+
monkeypatch.setenv("TDA_READ_BATCH_SIZE", "100_000")
36+
env_var_timdex_dataset = TIMDEXDataset(location=local_dataset_location)
37+
env_var_read_batch_config = env_var_timdex_dataset.config.read_batch_size
38+
assert env_var_read_batch_config == 100_000
39+
40+
41+
def test_dataset_init_custom_config_object(monkeypatch, local_dataset_location):
42+
config = TIMDEXDatasetConfig()
43+
config.max_rows_per_file = 42
44+
timdex_dataset = TIMDEXDataset(location=local_dataset_location, config=config)
45+
assert timdex_dataset.config.max_rows_per_file == 42
46+
47+
2648
@patch("timdex_dataset_api.dataset.fs.LocalFileSystem")
2749
@patch("timdex_dataset_api.dataset.ds.dataset")
2850
def test_dataset_load_local_sets_filesystem_and_dataset_success(
@@ -73,28 +95,28 @@ def test_dataset_load_without_filters_success(fixed_local_dataset):
7395
fixed_local_dataset.load()
7496

7597
assert os.path.exists(fixed_local_dataset.location)
76-
assert fixed_local_dataset.row_count == 5_000 # noqa: PLR2004
98+
assert fixed_local_dataset.row_count == 5_000
7799

78100

79101
def test_dataset_load_with_run_date_str_filters_success(fixed_local_dataset):
80102
fixed_local_dataset.load(run_date="2024-12-01")
81103

82104
assert os.path.exists(fixed_local_dataset.location)
83-
assert fixed_local_dataset.row_count == 5_000 # noqa: PLR2004
105+
assert fixed_local_dataset.row_count == 5_000
84106

85107

86108
def test_dataset_load_with_run_date_obj_filters_success(fixed_local_dataset):
87109
fixed_local_dataset.load(run_date=date(2024, 12, 1))
88110

89111
assert os.path.exists(fixed_local_dataset.location)
90-
assert fixed_local_dataset.row_count == 5_000 # noqa: PLR2004
112+
assert fixed_local_dataset.row_count == 5_000
91113

92114

93115
def test_dataset_load_with_ymd_filters_success(fixed_local_dataset):
94116
fixed_local_dataset.load(year="2024", month="12", day="01")
95117

96118
assert os.path.exists(fixed_local_dataset.location)
97-
assert fixed_local_dataset.row_count == 5_000 # noqa: PLR2004
119+
assert fixed_local_dataset.row_count == 5_000
98120

99121

100122
def test_dataset_load_with_single_nonpartition_filters_success(fixed_local_dataset):
@@ -158,7 +180,7 @@ def test_dataset_get_filtered_dataset_with_or_nonpartition_filters_success(
158180
timdex_record_id=["alma:0", "alma:1"]
159181
)
160182
filtered_local_df = filtered_local_dataset.to_table().to_pandas()
161-
assert len(filtered_local_df) == 2 # noqa: PLR2004
183+
assert len(filtered_local_df) == 2
162184
assert filtered_local_df["timdex_record_id"].tolist() == ["alma:0", "alma:1"]
163185

164186

tests/test_write.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
from tests.utils import generate_sample_records
1010
from timdex_dataset_api.dataset import (
11-
MAX_ROWS_PER_FILE,
1211
TIMDEX_DATASET_SCHEMA,
1312
TIMDEXDataset,
1413
)
@@ -28,28 +27,29 @@ def test_dataset_write_records_to_new_local_dataset(
2827
def test_dataset_write_default_max_rows_per_file(new_local_dataset, sample_records_iter):
2928
"""Default is 100k rows per file, therefore writing 200,033 records should result in
3029
3 files (x2 @ 100k rows, x1 @ 33 rows)."""
30+
default_max_rows_per_file = new_local_dataset.config.max_rows_per_file
3131
total_records = 200_033
3232

3333
new_local_dataset.write(sample_records_iter(total_records))
3434
new_local_dataset.load()
3535

3636
assert new_local_dataset.row_count == total_records
3737
assert len(new_local_dataset.dataset.files) == math.ceil(
38-
total_records / MAX_ROWS_PER_FILE
38+
total_records / default_max_rows_per_file
3939
)
4040

4141

4242
def test_dataset_write_record_batches_uses_batch_size(
4343
new_local_dataset, sample_records_iter
4444
):
4545
total_records = 101
46-
batch_size = 50
46+
new_local_dataset.config.write_batch_size = 50
4747
batches = list(
48-
new_local_dataset.create_record_batches(
49-
sample_records_iter(total_records), batch_size=batch_size
50-
)
48+
new_local_dataset.create_record_batches(sample_records_iter(total_records))
49+
)
50+
assert len(batches) == math.ceil(
51+
total_records / new_local_dataset.config.write_batch_size
5152
)
52-
assert len(batches) == math.ceil(total_records / batch_size)
5353

5454

5555
def test_dataset_write_to_multiple_locations_raise_error(sample_records_iter):

timdex_dataset_api/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from timdex_dataset_api.dataset import TIMDEXDataset
44
from timdex_dataset_api.record import DatasetRecord
55

6-
__version__ = "0.9.0"
6+
__version__ = "0.10.0"
77

88
__all__ = [
99
"DatasetRecord",

0 commit comments

Comments (0)