Skip to content

Commit 4fd4920

Browse files
committed
Uses prefetched facets
1 parent 6893d82 commit 4fd4920

1 file changed

Lines changed: 95 additions & 100 deletions

File tree

scripts/1-fetch/europeana_fetch.py

Lines changed: 95 additions & 100 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
import textwrap
1616
import time
1717
import traceback
18+
from operator import itemgetter
1819

1920
# Third-party
2021
import requests
@@ -42,7 +43,7 @@
4243
FILE_WITHOUT_THEMES = shared.path_join(
4344
PATHS["data_phase"], "europeana_without_themes.csv"
4445
)
45-
HEADER_WITH_THEMES = ["DATA_PROVIDER", "LEGAL_TOOL", "THEME", "COUNT"]
46+
HEADER_WITH_THEMES = ["DATA_PROVIDER", "THEME", "LEGAL_TOOL", "COUNT"]
4647
HEADER_WITHOUT_THEMES = ["DATA_PROVIDER", "LEGAL_TOOL", "COUNT"]
4748
QUARTER = os.path.basename(PATHS["data_quarter"])
4849
TIMEOUT = 25
@@ -205,12 +206,16 @@ def simplify_legal_tool(legal_tool):
205206

206207

207208
def get_facet_list(session, facet_field):
208-
"""Fetch complete facet list from Europeana API for a given facet field."""
209+
"""
210+
Fetch complete facet list from Europeana API for a given facet field,
211+
returning both label and count, sorted by count descending.
212+
Returns: list of dicts: [{'label': ..., 'count': ...}]
213+
"""
209214
all_values = []
210215
offset = 0
211216
limit = 1000
212217

213-
LOGGER.info(f"Fetching {facet_field} facet values.")
218+
LOGGER.info(f"Fetching {facet_field} facet values with counts.")
214219

215220
while True:
216221
params = {
@@ -238,13 +243,13 @@ def get_facet_list(session, facet_field):
238243
break
239244

240245
fields = facets[0]["fields"]
241-
new_values = [f["label"] for f in fields if f.get("label")]
242-
243-
for v in new_values:
244-
if v not in all_values:
245-
all_values.append(v)
246+
for f in fields:
247+
label = f.get("label")
248+
count = f.get("count", 0)
249+
if label and not any(d["label"] == label for d in all_values):
250+
all_values.append({"label": label, "count": count})
246251

247-
if len(new_values) < limit:
252+
if len(fields) < limit:
248253
break
249254

250255
offset += limit
@@ -253,149 +258,134 @@ def get_facet_list(session, facet_field):
253258
LOGGER.info(
254259
f"Completed fetching {facet_field}. Total unique: {len(all_values)}"
255260
)
256-
all_values.sort()
257-
return all_values
258261

262+
# Sort by count descending
263+
all_values.sort(key=lambda x: x["count"], reverse=True)
264+
return all_values
259265

260-
def fetch_europeana_data_without_themes(
    session, providers_full, rights_full, limit=None
):
    """
    Fetch item counts per DATA_PROVIDER × RIGHTS using pre-fetched facets.

    Args:
        session: HTTP session; only its ``get(url, params=, timeout=)``
            method is used.
        providers_full: list of dicts with 'label' and 'count' keys, one
            per DATA_PROVIDER facet value.
        rights_full: list of dicts with 'label' and 'count' keys, one per
            RIGHTS facet value.
        limit: if truthy, only the first ``limit`` non-zero providers (in
            the order given) are queried.

    Returns:
        list of dicts with DATA_PROVIDER, LEGAL_TOOL, and COUNT keys,
        sorted by DATA_PROVIDER then LEGAL_TOOL. Combinations with zero
        results, or whose request failed, are omitted.
    """
    # Facet values with a zero count cannot contribute any rows
    providers_nonzero = [p["label"] for p in providers_full if p["count"] > 0]
    if limit:
        providers_nonzero = providers_nonzero[:limit]
    rights_nonzero = [r["label"] for r in rights_full if r["count"] > 0]
    total_providers = len(providers_nonzero)

    output = []
    for i, provider in enumerate(providers_nonzero, start=1):
        LOGGER.info(
            f"[{i}/{total_providers}] Fetching counts for provider={provider}"
        )

        for rights_url in rights_nonzero:
            simplified_rights = simplify_legal_tool(rights_url)
            query = f'DATA_PROVIDER:"{provider}" AND RIGHTS:"{rights_url}"'
            params_detail = {
                "wskey": EUROPEANA_API_KEY,
                "rows": 0,  # only totalResults is needed, not the items
                "query": query,
            }
            try:
                resp = session.get(
                    BASE_URL, params=params_detail, timeout=TIMEOUT
                )
                resp.raise_for_status()
                count = resp.json().get("totalResults", 0)
            except (requests.RequestException, ValueError) as e:
                # ValueError covers a non-JSON body on requests versions
                # whose JSON decode error is not a RequestException; one
                # bad response should not abort the whole crawl.
                LOGGER.warning(
                    f"Failed for provider={provider}, rights={rights_url}: {e}"
                )
            else:
                if count > 0:
                    output.append(
                        {
                            "DATA_PROVIDER": provider,
                            "LEGAL_TOOL": simplified_rights,
                            "COUNT": count,
                        }
                    )
            # Short pause between consecutive API requests
            time.sleep(0.01)

    # Sort by DATA_PROVIDER, LEGAL_TOOL
    output.sort(key=itemgetter("DATA_PROVIDER", "LEGAL_TOOL"))

    LOGGER.info(
        f"Aggregated {len(output)} records for provider-rights counts."
    )
    return output
326324

327-
def fetch_europeana_data_with_themes(
    session, providers_full, rights_full, themes, limit=None
):
    """
    Fetch item counts per DATA_PROVIDER × RIGHTS × THEME.

    Uses the pre-fetched providers_full and rights_full facet lists
    instead of requesting the facets again.

    Args:
        session: HTTP session; only its ``get(url, params=, timeout=)``
            method is used.
        providers_full: list of dicts with 'label' and 'count' keys, one
            per DATA_PROVIDER facet value.
        rights_full: list of dicts with 'label' and 'count' keys, one per
            RIGHTS facet value.
        themes: iterable of theme identifiers; each is sent as the
            ``theme`` request parameter.
        limit: if truthy, only the first ``limit`` non-zero providers (in
            the order given) are queried.

    Returns:
        list of dicts with DATA_PROVIDER, THEME, LEGAL_TOOL, and COUNT
        keys, sorted by DATA_PROVIDER, THEME, then LEGAL_TOOL.
        Combinations with zero results, or whose request failed, are
        omitted.
    """
    # Facet values with a zero count cannot contribute any rows
    providers_nonzero = [p["label"] for p in providers_full if p["count"] > 0]
    if limit:
        providers_nonzero = providers_nonzero[:limit]
    rights_nonzero = [r["label"] for r in rights_full if r["count"] > 0]
    total_providers = len(providers_nonzero)

    output = []
    for i, provider in enumerate(providers_nonzero, start=1):
        LOGGER.info(
            f"[{i}/{total_providers}] "
            f"Fetching rights+theme counts for provider={provider}"
        )

        for rights_url in rights_nonzero:
            simplified_rights = simplify_legal_tool(rights_url)
            # The query does not depend on the theme, so build it once
            query = f'DATA_PROVIDER:"{provider}" AND RIGHTS:"{rights_url}"'

            for theme in themes:
                params_detail = {
                    "wskey": EUROPEANA_API_KEY,
                    "rows": 0,  # only totalResults is needed
                    "query": query,
                    "theme": theme,
                }
                try:
                    resp = session.get(
                        BASE_URL, params=params_detail, timeout=TIMEOUT
                    )
                    resp.raise_for_status()
                    count = resp.json().get("totalResults", 0)
                except (requests.RequestException, ValueError) as e:
                    # ValueError covers a non-JSON body on requests
                    # versions whose JSON decode error is not a
                    # RequestException; keep crawling on a bad response.
                    LOGGER.warning(
                        f"Failed for provider={provider}, "
                        f"rights={rights_url}, theme={theme}: {e}"
                    )
                else:
                    if count > 0:
                        output.append(
                            {
                                "DATA_PROVIDER": provider,
                                "THEME": theme,
                                "LEGAL_TOOL": simplified_rights,
                                "COUNT": count,
                            }
                        )
                # Short pause between consecutive API requests
                time.sleep(0.01)

    # Sort by DATA_PROVIDER, THEME, LEGAL_TOOL
    output.sort(key=itemgetter("DATA_PROVIDER", "THEME", "LEGAL_TOOL"))

    LOGGER.info(
        f"Aggregated {len(output)} records for provider-rights-theme counts."
    )
    return output
400390

401391

@@ -445,17 +435,22 @@ def main():
445435

446436
session = get_requests_session()
447437

438+
# Fetch facet lists once, including counts
448439
providers_full = get_facet_list(session, "DATA_PROVIDER")
449440
rights_full = get_facet_list(session, "RIGHTS")
441+
450442
LOGGER.info(f"Facet providers loaded: {len(providers_full)}")
451443
LOGGER.info(f"Facet rights loaded: {len(rights_full)}")
444+
445+
# Pass facets to fetch functions
452446
data_no_theme = fetch_europeana_data_without_themes(
453-
session, limit=args.limit
447+
session, providers_full, rights_full, limit=args.limit
454448
)
455449
data_with_theme = fetch_europeana_data_with_themes(
456-
session, THEMES, limit=args.limit
450+
session, providers_full, rights_full, THEMES, limit=args.limit
457451
)
458452

453+
# Write to CSV and optionally push to git
459454
args = write_data(args, data_no_theme, data_with_theme)
460455
args = shared.git_add_and_commit(
461456
args,

0 commit comments

Comments
 (0)