Skip to content

Commit 47e74c3

Browse files
authored
Merge pull request #52 from MITLibraries/TIMX-417-read-from-dataset
TIMX 417 - read from dataset
2 parents 41b70e3 + 81cdccf commit 47e74c3

4 files changed

Lines changed: 220 additions & 114 deletions

File tree

tests/conftest.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,23 @@ def fixed_local_dataset(tmp_path) -> TIMDEXDataset:
4949
method.
5050
"""
5151
timdex_dataset = TIMDEXDataset(str(tmp_path / "fixed_local_dataset/"))
52-
timdex_dataset.write(generate_sample_records(num_records=5_000, run_id="abc123"))
52+
for source, run_id in [
53+
("alma", "abc123"),
54+
("dspace", "def456"),
55+
("aspace", "ghi789"),
56+
("libguides", "jkl123"),
57+
("gismit", "mno456"),
58+
]:
59+
timdex_dataset.write(
60+
generate_sample_records(
61+
num_records=1_000,
62+
timdex_record_id_prefix=source,
63+
source=source,
64+
run_date="2024-12-01",
65+
run_id=run_id,
66+
)
67+
)
68+
timdex_dataset.load()
5369
return timdex_dataset
5470

5571

tests/test_dataset_read.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# ruff: noqa: PLR2004, PD901
2+
3+
import pandas as pd
4+
import pyarrow as pa
5+
import pytest
6+
7+
# Column names that the read tests in this module expect to find on an
# unfiltered record batch and on each record dictionary.
DATASET_COLUMNS_SET = {
    "timdex_record_id",
    "source_record",
    "transformed_record",
    "source",
    "run_date",
    "run_type",
    "run_id",
    "action",
    "year",
    "month",
    "day",
}
20+
21+
22+
def test_read_batches_yields_pyarrow_record_batches(fixed_local_dataset):
    """Check that the first item from ``read_batches_iter()`` is a ``pa.RecordBatch``."""
    first_batch = next(fixed_local_dataset.read_batches_iter())
    assert isinstance(first_batch, pa.RecordBatch)
26+
27+
28+
def test_read_batches_all_columns_by_default(fixed_local_dataset):
    """Check that a batch read without a ``columns`` argument has every expected column."""
    first_batch = next(fixed_local_dataset.read_batches_iter())
    returned_columns = set(first_batch.column_names)
    assert returned_columns == DATASET_COLUMNS_SET
32+
33+
34+
def test_read_batches_filter_columns(fixed_local_dataset):
    """Check that passing ``columns`` limits the batch to exactly those columns."""
    wanted_columns = ["source", "transformed_record"]
    first_batch = next(fixed_local_dataset.read_batches_iter(columns=wanted_columns))
    assert set(first_batch.column_names) == set(wanted_columns)
39+
40+
41+
def test_read_batches_no_filters_gets_full_dataset(fixed_local_dataset):
    """Check that unfiltered batches add up to the dataset's ``row_count``."""
    full_table = pa.Table.from_batches(fixed_local_dataset.read_batches_iter())
    assert len(full_table) == fixed_local_dataset.row_count
45+
46+
47+
def test_read_batches_with_filters_gets_subset_of_dataset(fixed_local_dataset):
    """Check that filter arguments narrow the rows read without changing the dataset."""
    filters = {
        "source": "libguides",
        "run_date": "2024-12-01",
        "run_type": "daily",
        "action": "index",
    }
    filtered_table = pa.Table.from_batches(
        fixed_local_dataset.read_batches_iter(**filters)
    )

    assert len(filtered_table) == 1_000
    assert len(filtered_table) < fixed_local_dataset.row_count

    # assert loaded dataset is unchanged by filtering for a read method
    assert fixed_local_dataset.row_count == 5_000
61+
62+
63+
def test_read_dataframe_batches_yields_dataframes(fixed_local_dataset):
    """Check that ``read_dataframes_iter()`` first yields a 1,000-row ``pd.DataFrame``."""
    first_frame = next(fixed_local_dataset.read_dataframes_iter())
    assert isinstance(first_frame, pd.DataFrame)
    assert len(first_frame) == 1_000
68+
69+
70+
def test_read_dataframe_reads_all_dataset_rows_after_filtering(fixed_local_dataset):
    """Check that ``read_dataframe()`` returns one DataFrame with ``row_count`` rows.

    No filter arguments are passed in this test, so the expected length is
    the dataset's full ``row_count``.
    """
    whole_frame = fixed_local_dataset.read_dataframe()
    assert isinstance(whole_frame, pd.DataFrame)
    assert len(whole_frame) == fixed_local_dataset.row_count
74+
75+
76+
def test_read_dicts_yields_dictionary_for_each_dataset_record(fixed_local_dataset):
    """Check that ``read_dicts_iter()`` first yields a dict keyed by every expected column."""
    first_record = next(fixed_local_dataset.read_dicts_iter())
    assert isinstance(first_record, dict)
    assert set(first_record.keys()) == DATASET_COLUMNS_SET
81+
82+
83+
def test_read_batches_filter_to_none_returns_empty_list(fixed_local_dataset):
    """Check that a ``source`` filter matching nothing produces no batches."""
    unmatched_batches = list(
        fixed_local_dataset.read_batches_iter(source="not-gonna-find-me")
    )
    assert unmatched_batches == []
86+
87+
88+
def test_read_dicts_filter_to_none_stopiteration_immediately(fixed_local_dataset):
    """Check that ``next()`` raises ``StopIteration`` at once when the filter matches nothing."""
    records = fixed_local_dataset.read_dicts_iter(source="not-gonna-find-me")
    with pytest.raises(StopIteration):
        next(records)

timdex_dataset_api/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from timdex_dataset_api.dataset import TIMDEXDataset
44
from timdex_dataset_api.record import DatasetRecord
55

6-
__version__ = "0.4.0"
6+
__version__ = "0.5.0"
77

88
__all__ = [
99
"DatasetRecord",

0 commit comments

Comments
 (0)