|
| 1 | +# ruff: noqa: PLR2004, PD901 |
| 2 | + |
| 3 | +import pandas as pd |
| 4 | +import pyarrow as pa |
| 5 | +import pytest |
| 6 | + |
# Column names the tests below expect when a read method is called without a
# ``columns`` argument. Stored as a frozenset so that no test can accidentally
# mutate this shared module-level constant; equality comparisons against a plain
# ``set`` behave exactly as before.
DATASET_COLUMNS_SET = frozenset(
    {
        "timdex_record_id",
        "source_record",
        "transformed_record",
        "source",
        "run_date",
        "run_type",
        "run_id",
        "action",
        "year",
        "month",
        "day",
    }
)
| 20 | + |
| 21 | + |
def test_read_batches_yields_pyarrow_record_batches(fixed_local_dataset):
    """The first item from read_batches_iter() is a pyarrow RecordBatch."""
    first_batch = next(fixed_local_dataset.read_batches_iter())
    assert isinstance(first_batch, pa.RecordBatch)
| 26 | + |
| 27 | + |
def test_read_batches_all_columns_by_default(fixed_local_dataset):
    """With no ``columns`` argument, the first batch has every expected column."""
    batch_iter = fixed_local_dataset.read_batches_iter()
    first_batch = next(batch_iter)
    found_columns = set(first_batch.column_names)
    assert found_columns == DATASET_COLUMNS_SET
| 32 | + |
| 33 | + |
def test_read_batches_filter_columns(fixed_local_dataset):
    """Passing ``columns`` limits the first batch to exactly those columns."""
    requested = ["source", "transformed_record"]
    first_batch = next(fixed_local_dataset.read_batches_iter(columns=requested))
    assert set(first_batch.column_names) == set(requested)
| 39 | + |
| 40 | + |
def test_read_batches_no_filters_gets_full_dataset(fixed_local_dataset):
    """All unfiltered batches together hold as many rows as the dataset reports."""
    full_table = pa.Table.from_batches(fixed_local_dataset.read_batches_iter())
    assert len(full_table) == fixed_local_dataset.row_count
| 45 | + |
| 46 | + |
def test_read_batches_with_filters_gets_subset_of_dataset(fixed_local_dataset):
    """Filtered batches hold 1,000 rows, fewer than the dataset's full row count."""
    filters = {
        "source": "libguides",
        "run_date": "2024-12-01",
        "run_type": "daily",
        "action": "index",
    }
    filtered_table = pa.Table.from_batches(
        fixed_local_dataset.read_batches_iter(**filters)
    )

    assert len(filtered_table) == 1_000
    assert len(filtered_table) < fixed_local_dataset.row_count

    # filters passed to a read method must not alter the loaded dataset itself
    assert fixed_local_dataset.row_count == 5_000
| 61 | + |
| 62 | + |
def test_read_dataframe_batches_yields_dataframes(fixed_local_dataset):
    """The first item from read_dataframes_iter() is a 1,000-row DataFrame."""
    first_frame = next(fixed_local_dataset.read_dataframes_iter())
    assert isinstance(first_frame, pd.DataFrame)
    assert len(first_frame) == 1_000
| 68 | + |
| 69 | + |
def test_read_dataframe_reads_all_dataset_rows_after_filtering(fixed_local_dataset):
    """read_dataframe() returns one DataFrame with a row per dataset row.

    No filters are passed here, so the expected length is the dataset's full
    ``row_count``.
    """
    full_frame = fixed_local_dataset.read_dataframe()
    assert isinstance(full_frame, pd.DataFrame)
    assert len(full_frame) == fixed_local_dataset.row_count
| 74 | + |
| 75 | + |
def test_read_dicts_yields_dictionary_for_each_dataset_record(fixed_local_dataset):
    """The first item from read_dicts_iter() is a dict keyed by every column."""
    first_record = next(fixed_local_dataset.read_dicts_iter())
    assert isinstance(first_record, dict)
    record_keys = set(first_record.keys())
    assert record_keys == DATASET_COLUMNS_SET
| 81 | + |
| 82 | + |
def test_read_batches_filter_to_none_returns_empty_list(fixed_local_dataset):
    """A source filter that matches nothing produces no batches at all."""
    batch_iter = fixed_local_dataset.read_batches_iter(source="not-gonna-find-me")
    collected = list(batch_iter)
    assert collected == []
| 86 | + |
| 87 | + |
def test_read_dicts_filter_to_none_stopiteration_immediately(fixed_local_dataset):
    """A source filter that matches nothing exhausts the dict iterator at once."""
    record_iter = fixed_local_dataset.read_dicts_iter(source="not-gonna-find-me")
    with pytest.raises(StopIteration):
        next(record_iter)
0 commit comments