Skip to content

Commit 07f75ee

Browse files
authored
Merge pull request #68 from MITLibraries/TIMX-427-improve-logging
TIMX 427 - improve logging
2 parents 9e807e9 + 12bc90d commit 07f75ee

6 files changed

Lines changed: 90 additions & 109 deletions

File tree

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,9 @@
44
import pyarrow as pa
55
import pytest
66

7-
DATASET_COLUMNS_SET = {
8-
"timdex_record_id",
9-
"source_record",
10-
"transformed_record",
11-
"source",
12-
"run_date",
13-
"run_type",
14-
"run_id",
15-
"action",
16-
"year",
17-
"month",
18-
"day",
19-
}
7+
from timdex_dataset_api.dataset import TIMDEX_DATASET_SCHEMA
8+
9+
DATASET_COLUMNS_SET = set(TIMDEX_DATASET_SCHEMA.names)
2010

2111

2212
def test_read_batches_yields_pyarrow_record_batches(fixed_local_dataset):

tests/test_records.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import re
2+
from datetime import date
3+
4+
import pytest
5+
6+
from timdex_dataset_api.record import DatasetRecord
7+
8+
9+
def test_dataset_record_init_with_valid_run_date_parses_year_month_day():
10+
values = {
11+
"timdex_record_id": "alma:123",
12+
"source_record": b"<record><title>Hello World.</title></record>",
13+
"transformed_record": b"""{"title":["Hello World."]}""",
14+
"source": "libguides",
15+
"run_date": "2024-12-01",
16+
"run_type": "full",
17+
"action": "index",
18+
"run_id": "000-111-aaa-bbb",
19+
}
20+
record = DatasetRecord(**values)
21+
22+
assert record
23+
assert (record.year, record.month, record.day) == (
24+
"2024",
25+
"12",
26+
"01",
27+
)
28+
29+
30+
def test_dataset_record_init_with_invalid_run_date_raise_error():
31+
values = {
32+
"timdex_record_id": "alma:123",
33+
"source_record": b"<record><title>Hello World.</title></record>",
34+
"transformed_record": b"""{"title":["Hello World."]}""",
35+
"source": "libguides",
36+
"run_date": "-12-01",
37+
"run_type": "full",
38+
"action": "index",
39+
"run_id": "000-111-aaa-bbb",
40+
}
41+
42+
with pytest.raises(
43+
ValueError, match=re.escape("time data '-12-01' does not match format '%Y-%m-%d'")
44+
):
45+
DatasetRecord(**values)
46+
47+
48+
def test_dataset_record_serialization():
49+
values = {
50+
"timdex_record_id": "alma:123",
51+
"source_record": b"<record><title>Hello World.</title></record>",
52+
"transformed_record": b"""{"title":["Hello World."]}""",
53+
"source": "libguides",
54+
"run_date": "2024-12-01",
55+
"run_type": "full",
56+
"action": "index",
57+
"run_id": "abc123",
58+
}
59+
dataset_record = DatasetRecord(**values)
60+
61+
assert dataset_record.to_dict() == {
62+
"timdex_record_id": "alma:123",
63+
"source_record": b"<record><title>Hello World.</title></record>",
64+
"transformed_record": b"""{"title":["Hello World."]}""",
65+
"source": "libguides",
66+
"run_date": date(2024, 12, 1),
67+
"run_type": "full",
68+
"action": "index",
69+
"run_id": "abc123",
70+
"year": "2024",
71+
"month": "12",
72+
"day": "01",
73+
}
Lines changed: 1 addition & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
# ruff: noqa: S105, S106, SLF001, PLR2004, PD901, D209, D205
22
import math
33
import os
4-
import re
5-
from datetime import date
64
from unittest.mock import patch
75

86
import pyarrow.dataset as ds
@@ -14,74 +12,6 @@
1412
TIMDEX_DATASET_SCHEMA,
1513
TIMDEXDataset,
1614
)
17-
from timdex_dataset_api.record import DatasetRecord
18-
19-
20-
def test_dataset_record_init():
21-
values = {
22-
"timdex_record_id": "alma:123",
23-
"source_record": b"<record><title>Hello World.</title></record>",
24-
"transformed_record": b"""{"title":["Hello World."]}""",
25-
"source": "libguides",
26-
"run_date": "2024-12-01",
27-
"run_type": "full",
28-
"action": "index",
29-
"run_id": "000-111-aaa-bbb",
30-
}
31-
record = DatasetRecord(**values)
32-
33-
assert record
34-
assert (record.year, record.month, record.day) == (
35-
"2024",
36-
"12",
37-
"01",
38-
)
39-
40-
41-
def test_dataset_record_init_with_invalid_run_date_raise_error():
42-
values = {
43-
"timdex_record_id": "alma:123",
44-
"source_record": b"<record><title>Hello World.</title></record>",
45-
"transformed_record": b"""{"title":["Hello World."]}""",
46-
"source": "libguides",
47-
"run_date": "-12-01",
48-
"run_type": "full",
49-
"action": "index",
50-
"run_id": "000-111-aaa-bbb",
51-
}
52-
53-
with pytest.raises(
54-
ValueError, match=re.escape("time data '-12-01' does not match format '%Y-%m-%d'")
55-
):
56-
DatasetRecord(**values)
57-
58-
59-
def test_dataset_record_serialization():
60-
values = {
61-
"timdex_record_id": "alma:123",
62-
"source_record": b"<record><title>Hello World.</title></record>",
63-
"transformed_record": b"""{"title":["Hello World."]}""",
64-
"source": "libguides",
65-
"run_date": "2024-12-01",
66-
"run_type": "full",
67-
"action": "index",
68-
"run_id": "abc123",
69-
}
70-
dataset_record = DatasetRecord(**values)
71-
72-
assert dataset_record.to_dict() == {
73-
"timdex_record_id": "alma:123",
74-
"source_record": b"<record><title>Hello World.</title></record>",
75-
"transformed_record": b"""{"title":["Hello World."]}""",
76-
"source": "libguides",
77-
"run_date": date(2024, 12, 1),
78-
"run_type": "full",
79-
"action": "index",
80-
"run_id": "abc123",
81-
"year": "2024",
82-
"month": "12",
83-
"day": "01",
84-
}
8515

8616

8717
def test_dataset_write_records_to_new_local_dataset(
@@ -115,7 +45,7 @@ def test_dataset_write_record_batches_uses_batch_size(
11545
total_records = 101
11646
batch_size = 50
11747
batches = list(
118-
new_local_dataset.get_dataset_record_batches(
48+
new_local_dataset.create_record_batches(
11949
sample_records_iter(total_records), batch_size=batch_size
12050
)
12151
)

timdex_dataset_api/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from timdex_dataset_api.dataset import TIMDEXDataset
44
from timdex_dataset_api.record import DatasetRecord
55

6-
__version__ = "0.6.0"
6+
__version__ = "0.7.0"
77

88
__all__ = [
99
"DatasetRecord",

timdex_dataset_api/config.py

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,29 +3,21 @@
33

44

55
def configure_logger(name: str) -> logging.Logger:
6-
"""Prepares and returns a logger instance for a given module name.
6+
"""Prepares a logger instance.
77
8-
This approach is suitable for an installed and imported library such as this, where
9-
any calling application logging levels and handlers should be utilized.
8+
If the env var TDA_LOG_LEVEL is set, the logging level will override the logging
9+
level of the calling context.
1010
1111
Args:
1212
name (str): The name of the logger, typically __name__ is passed by caller
1313
"""
1414
logger = logging.getLogger(name)
15-
logger.addHandler(logging.NullHandler())
1615

17-
log_level = os.getenv("TDA_LOG_LEVEL", "INFO").strip().upper()
18-
if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
19-
raise ValueError(f"Invalid log level: '{log_level}'")
20-
logger.setLevel(getattr(logging, log_level))
21-
22-
handler = logging.StreamHandler()
23-
handler.setFormatter(
24-
logging.Formatter(
25-
"%(asctime)s %(levelname)s %(name)s.%(funcName)s() "
26-
"line %(lineno)d: %(message)s"
27-
)
28-
)
29-
logger.addHandler(handler)
16+
# set logger level if env var 'TDA_LOG_LEVEL' is set
17+
if log_level := os.getenv("TDA_LOG_LEVEL"):
18+
log_level = log_level.strip().upper()
19+
if log_level not in logging.getLevelNamesMapping():
20+
raise ValueError(f"Invalid log level: '{log_level}'")
21+
logger.setLevel(getattr(logging, log_level))
3022

3123
return logger

timdex_dataset_api/dataset.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ def write(
316316
"Dataset location must be the root of a single dataset for writing"
317317
)
318318

319-
record_batches_iter = self.get_dataset_record_batches(
319+
record_batches_iter = self.create_record_batches(
320320
records_iter,
321321
batch_size=batch_size,
322322
)
@@ -341,7 +341,7 @@ def write(
341341
self.log_write_statistics(start_time)
342342
return self._written_files # type: ignore[return-value]
343343

344-
def get_dataset_record_batches(
344+
def create_record_batches(
345345
self,
346346
records_iter: Iterator["DatasetRecord"],
347347
*,
@@ -360,14 +360,10 @@ def get_dataset_record_batches(
360360
group size in final parquet files
361361
"""
362362
for i, record_batch in enumerate(itertools.batched(records_iter, batch_size)):
363-
batch_start_time = time.perf_counter()
364363
batch = pa.RecordBatch.from_pylist(
365364
[record.to_dict() for record in record_batch]
366365
)
367-
logger.debug(
368-
f"Batch {i + 1} yielded for writing, "
369-
f"elapsed: {round(time.perf_counter()-batch_start_time, 6)}s"
370-
)
366+
logger.debug(f"Yielding batch {i+1} for dataset writing.")
371367
yield batch
372368

373369
def log_write_statistics(self, start_time: float) -> None:

0 commit comments

Comments (0)