Skip to content

Commit abc2576

Browse files
committed
Reviewed PR changes
1 parent ad4ee2b commit abc2576

1 file changed

Lines changed: 43 additions & 21 deletions

File tree

scripts/1-fetch/museums_victoria_fetch.py

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from collections import defaultdict
1414

1515
# Third-party
16+
from operator import itemgetter
1617
import requests
1718
from pygments import highlight
1819
from pygments.formatters import TerminalFormatter
@@ -43,7 +44,6 @@
4344
HEADER1_COUNT = ["TOOL IDENTIFIER", "COUNT"]
4445
HEADER2_MEDIA = ["TOOL IDENTIFIER", "MEDIA TYPE", "COUNT"]
4546
HEADER3_RECORD = ["TOOL IDENTIFIER", "RECORD TYPE", "COUNT"]
46-
MAX_PER_PAGE = 100 # Pagination limit as defined by the API documentation
4747
QUARTER = os.path.basename(PATHS["data_quarter"])
4848
RECORD_TYPES = [
4949
"article",
@@ -69,6 +69,12 @@ def parse_arguments():
6969
action="store_true",
7070
help="Enable git actions (fetch, merge, add, commit, and push)",
7171
)
72+
parser.add_argument(
73+
"--limit",
74+
type=int,
75+
default=None,
76+
help="Maximum number of records to fetch per each record type",
77+
)
7278
args = parser.parse_args()
7379
if not args.enable_save and args.enable_git:
7480
parser.error("--enable-git requires --enable-save")
@@ -91,13 +97,6 @@ def get_requests_session():
9197
return session
9298

9399

94-
def sanitize_string(s):
95-
"""Replaces newline and carriage return characters with a space."""
96-
if isinstance(s, str):
97-
return s.replace("\n", " ").replace("\r", "")
98-
return s
99-
100-
101100
def initialize_data_file(file_path, header):
102101
if not os.path.isfile(file_path):
103102
with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
@@ -163,7 +162,7 @@ def write_counts_to_csv(args, data: dict):
163162
writer.writerows(rows)
164163

165164

166-
def fetch_museums_victoria_data(session):
165+
def fetch_museums_victoria_data(args, session):
167166
"""
168167
Fetches all records with images from the Museums Victoria API by iterating
169168
through all record types and handling pagination.
@@ -172,23 +171,33 @@ def fetch_museums_victoria_data(session):
172171
record_counts = defaultdict(lambda: defaultdict(int))
173172
media_counts = defaultdict(lambda: defaultdict(int))
174173
licences_count = defaultdict(int)
174+
total_records_processed = 0
175175

176176
# Iterate through each record type
177177
for record_type in RECORD_TYPES:
178178
current_page = 1
179179
total_pages = None
180+
per_page = 100
181+
if args.limit is not None:
182+
per_page = args.limit
183+
if total_records_processed >= args.limit:
184+
LOGGER.info(
185+
f"Limit Reached: {total_records_processed} processed. Skipping remaining record types."
186+
)
187+
break
180188

181-
LOGGER.info(f"--- Starting fetch for: {record_type.upper()} ---")
182-
189+
LOGGER.info(
190+
f"fetching page {current_page} of {record_type}s "
191+
f"(records {(current_page * per_page) - per_page}-"
192+
f"{current_page * per_page})"
193+
)
183194
while True:
184195
# 1. Construct the API query parameters
185196
params = {
186-
"recordtype": record_type,
187-
# "perpage": 20,
188-
"perpage": MAX_PER_PAGE,
189-
"page": current_page,
190-
# "page": 1,
191197
"envelope": "true",
198+
"page": current_page,
199+
"perpage": per_page,
200+
"recordtype": record_type,
192201
}
193202
try:
194203
r = session.get(BASE_URL, params=params, timeout=30)
@@ -202,6 +211,7 @@ def fetch_museums_victoria_data(session):
202211
data = r.json()
203212
results = data.get("response", [])
204213
for res in results:
214+
total_records_processed += 1
205215
media_list = res.get("media", [])
206216
for media_item in media_list:
207217
licence_data = media_item.get("licence")
@@ -219,26 +229,38 @@ def fetch_museums_victoria_data(session):
219229
record_counts[record_type][license_short_name] += 1
220230
if total_pages is None:
221231
headers = data.get("headers", {})
222-
# total_pages = 1
223232
total_pages = int(headers.get("totalResults", "0"))
224233

234+
if args.limit is not None and total_records_processed >= per_page:
235+
break
225236
current_page += 1
237+
226238
if current_page > total_pages:
227239
break
240+
228241
return {
229-
FILE1_COUNT: licences_count,
230-
FILE2_MEDIA: media_counts,
231-
FILE3_RECORD: record_counts,
242+
FILE1_COUNT: dict(sorted(licences_count.items())),
243+
FILE2_MEDIA: sort_nested_defaultdict(media_counts),
244+
FILE3_RECORD: sort_nested_defaultdict(record_counts),
232245
}
233246

247+
def sort_nested_defaultdict(d):
    """Return a plain-dict copy of ``d`` with keys sorted at every level.

    Any mapping that is a ``dict`` (including ``defaultdict``, which is a
    ``dict`` subclass) is rebuilt as a regular ``dict`` whose keys are in
    ascending order; nested mappings are processed recursively. Values that
    are not dicts are returned unchanged.

    Args:
        d: A possibly nested ``dict``/``defaultdict``, or any other value.

    Returns:
        A new regular ``dict`` with recursively sorted keys when ``d`` is a
        dict; otherwise ``d`` itself.
    """
    # A single dict check is sufficient: defaultdict subclasses dict, so a
    # separate defaultdict branch would only duplicate this logic.
    if not isinstance(d, dict):
        return d
    # Dict keys are unique, so sorting the (key, value) pairs orders by key
    # alone and never needs to compare the values.
    return {
        key: sort_nested_defaultdict(value)
        for key, value in sorted(d.items())
    }
254+
255+
234256

235257
def main():
236258
args = parse_arguments()
237259
shared.paths_log(LOGGER, PATHS)
238260
shared.git_fetch_and_merge(args, PATHS["repo"])
239261
initialize_all_data_files(args)
240262
session = get_requests_session()
241-
data = fetch_museums_victoria_data(session)
263+
data = fetch_museums_victoria_data(args, session)
242264
write_counts_to_csv(args, data)
243265
args = shared.git_add_and_commit(
244266
args,

0 commit comments

Comments
 (0)