1515import textwrap
1616import time
1717import traceback
18+ from operator import itemgetter
1819
1920# Third-party
2021import requests
4243FILE_WITHOUT_THEMES = shared .path_join (
4344 PATHS ["data_phase" ], "europeana_without_themes.csv"
4445)
45- HEADER_WITH_THEMES = ["DATA_PROVIDER" , "LEGAL_TOOL " , "THEME " , "COUNT" ]
46+ HEADER_WITH_THEMES = ["DATA_PROVIDER" , "THEME " , "LEGAL_TOOL " , "COUNT" ]
4647HEADER_WITHOUT_THEMES = ["DATA_PROVIDER" , "LEGAL_TOOL" , "COUNT" ]
4748QUARTER = os .path .basename (PATHS ["data_quarter" ])
4849TIMEOUT = 25
@@ -205,12 +206,16 @@ def simplify_legal_tool(legal_tool):
205206
206207
207208def get_facet_list (session , facet_field ):
208- """Fetch complete facet list from Europeana API for a given facet field."""
209+ """
210+ Fetch complete facet list from Europeana API for a given facet field,
211+ returning both label and count, sorted by count descending.
212+ Returns: list of dicts: [{'label': ..., 'count': ...}]
213+ """
209214 all_values = []
210215 offset = 0
211216 limit = 1000
212217
213- LOGGER .info (f"Fetching { facet_field } facet values." )
218+ LOGGER .info (f"Fetching { facet_field } facet values with counts ." )
214219
215220 while True :
216221 params = {
@@ -238,13 +243,13 @@ def get_facet_list(session, facet_field):
238243 break
239244
240245 fields = facets [0 ]["fields" ]
241- new_values = [ f [ "label" ] for f in fields if f . get ( "label" )]
242-
243- for v in new_values :
244- if v not in all_values :
245- all_values .append (v )
246+ for f in fields :
247+ label = f . get ( "label" )
248+ count = f . get ( "count" , 0 )
249+ if label and not any ( d [ "label" ] == label for d in all_values ) :
250+ all_values .append ({ "label" : label , "count" : count } )
246251
247- if len (new_values ) < limit :
252+ if len (fields ) < limit :
248253 break
249254
250255 offset += limit
@@ -253,149 +258,134 @@ def get_facet_list(session, facet_field):
253258 LOGGER .info (
254259 f"Completed fetching { facet_field } . Total unique: { len (all_values )} "
255260 )
256- all_values .sort ()
257- return all_values
258261
262+ # Sort by count descending
263+ all_values .sort (key = lambda x : x ["count" ], reverse = True )
264+ return all_values
259265
def fetch_europeana_data_without_themes(
    session, providers_full, rights_full, limit=None
):
    """
    Fetch counts per DATA_PROVIDER × RIGHTS using pre-fetched facets.

    One search request (rows=0) is issued for every combination of a
    provider and a rights statement; only combinations whose
    ``totalResults`` is greater than zero are kept.

    Args:
        session: session object whose ``get`` method is used to query
            BASE_URL.
        providers_full: list of dicts with "label" and "count" keys
            describing the DATA_PROVIDER facet.
        rights_full: list of dicts with "label" and "count" keys
            describing the RIGHTS facet.
        limit: optional maximum number of providers to query. The first
            ``limit`` non-zero providers are kept, in the order given. A
            falsy value (None or 0) means no limit.

    Returns:
        list of dicts with the keys "DATA_PROVIDER", "LEGAL_TOOL" and
        "COUNT", sorted by DATA_PROVIDER and then LEGAL_TOOL.
    """

    def escape_phrase(value):
        # Labels are embedded in a double-quoted query phrase: escape the
        # backslash and double quote so a label containing either cannot
        # terminate the phrase early and corrupt the query.
        return value.replace("\\", "\\\\").replace('"', '\\"')

    output = []

    # Filter non-zero providers
    providers_nonzero = [p["label"] for p in providers_full if p["count"] > 0]
    if limit:
        providers_nonzero = providers_nonzero[:limit]

    # Filter non-zero rights. The escaped query term and the simplified
    # legal tool depend only on the rights URL, so compute them once here
    # instead of once per provider.
    rights_terms = [
        (r["label"], escape_phrase(r["label"]), simplify_legal_tool(r["label"]))
        for r in rights_full
        if r["count"] > 0
    ]

    total = len(providers_nonzero)
    for i, provider in enumerate(providers_nonzero, start=1):
        LOGGER.info(
            f"[{i}/{total}] "
            f"Fetching counts for provider={provider}"
        )
        provider_term = escape_phrase(provider)

        for rights_url, rights_term, simplified_rights in rights_terms:
            params_detail = {
                "wskey": EUROPEANA_API_KEY,
                "rows": 0,
                "query": (
                    f'DATA_PROVIDER:"{provider_term}" '
                    f'AND RIGHTS:"{rights_term}"'
                ),
            }
            try:
                resp = session.get(
                    BASE_URL, params=params_detail, timeout=TIMEOUT
                )
                resp.raise_for_status()
                count = resp.json().get("totalResults", 0)
            except (requests.RequestException, ValueError) as e:
                # ValueError covers a body that is not valid JSON; one bad
                # response must not abort the whole crawl.
                LOGGER.warning(
                    f"Failed for provider={provider}, rights={rights_url}: {e}"
                )
            else:
                if count > 0:
                    output.append(
                        {
                            "DATA_PROVIDER": provider,
                            "LEGAL_TOOL": simplified_rights,
                            "COUNT": count,
                        }
                    )
            # Short pause between requests
            time.sleep(0.01)

    # Sort by DATA_PROVIDER, LEGAL_TOOL
    output.sort(key=itemgetter("DATA_PROVIDER", "LEGAL_TOOL"))

    LOGGER.info(
        f"Aggregated {len(output)} records for provider-rights counts."
    )
    return output
326324
def fetch_europeana_data_with_themes(
    session, providers_full, rights_full, themes, limit=None
):
    """
    Fetch counts per DATA_PROVIDER × RIGHTS × THEME
    Uses pre-fetched providers_full and rights_full lists.

    One search request (rows=0) is issued for every combination of a
    provider, a rights statement and a theme; only combinations whose
    ``totalResults`` is greater than zero are kept.

    Args:
        session: session object whose ``get`` method is used to query
            BASE_URL.
        providers_full: list of dicts with "label" and "count" keys
            describing the DATA_PROVIDER facet.
        rights_full: list of dicts with "label" and "count" keys
            describing the RIGHTS facet.
        themes: iterable of theme identifiers, each sent as the "theme"
            request parameter. It is iterated once per provider/rights
            pair, so it must be re-iterable (e.g. a list).
        limit: optional maximum number of providers to query. The first
            ``limit`` non-zero providers are kept, in the order given. A
            falsy value (None or 0) means no limit.

    Returns:
        list of dicts with the keys "DATA_PROVIDER", "THEME",
        "LEGAL_TOOL" and "COUNT", sorted by DATA_PROVIDER, THEME and
        then LEGAL_TOOL.
    """

    def escape_phrase(value):
        # Labels are embedded in a double-quoted query phrase: escape the
        # backslash and double quote so a label containing either cannot
        # terminate the phrase early and corrupt the query.
        return value.replace("\\", "\\\\").replace('"', '\\"')

    output = []

    # Filter non-zero providers
    providers_nonzero = [p["label"] for p in providers_full if p["count"] > 0]
    if limit:
        providers_nonzero = providers_nonzero[:limit]

    # Filter non-zero rights. The escaped query term and the simplified
    # legal tool depend only on the rights URL, so compute them once here
    # instead of once per provider.
    rights_terms = [
        (r["label"], escape_phrase(r["label"]), simplify_legal_tool(r["label"]))
        for r in rights_full
        if r["count"] > 0
    ]

    total = len(providers_nonzero)
    for i, provider in enumerate(providers_nonzero, start=1):
        LOGGER.info(
            f"[{i}/{total}] "
            f"Fetching rights+theme counts for provider={provider}"
        )
        provider_term = escape_phrase(provider)

        for rights_url, rights_term, simplified_rights in rights_terms:
            # The query does not depend on the theme, so build it once per
            # provider/rights pair rather than once per theme.
            query = (
                f'DATA_PROVIDER:"{provider_term}" '
                f'AND RIGHTS:"{rights_term}"'
            )
            for theme in themes:
                params_detail = {
                    "wskey": EUROPEANA_API_KEY,
                    "rows": 0,
                    "query": query,
                    "theme": theme,
                }
                try:
                    resp = session.get(
                        BASE_URL, params=params_detail, timeout=TIMEOUT
                    )
                    resp.raise_for_status()
                    count = resp.json().get("totalResults", 0)
                except (requests.RequestException, ValueError) as e:
                    # ValueError covers a body that is not valid JSON; one
                    # bad response must not abort the whole crawl.
                    LOGGER.warning(
                        f"Failed for provider={provider}, "
                        f"rights={rights_url}, theme={theme}: {e}"
                    )
                else:
                    if count > 0:
                        output.append(
                            {
                                "DATA_PROVIDER": provider,
                                "THEME": theme,
                                "LEGAL_TOOL": simplified_rights,
                                "COUNT": count,
                            }
                        )
                # Short pause between requests
                time.sleep(0.01)

    # Sort by DATA_PROVIDER, THEME, LEGAL_TOOL
    output.sort(key=itemgetter("DATA_PROVIDER", "THEME", "LEGAL_TOOL"))

    LOGGER.info(
        f"Aggregated {len(output)} records for provider-rights-theme counts."
    )
    return output
400390
401391
@@ -445,17 +435,22 @@ def main():
445435
446436 session = get_requests_session ()
447437
438+ # Fetch facet lists once, including counts
448439 providers_full = get_facet_list (session , "DATA_PROVIDER" )
449440 rights_full = get_facet_list (session , "RIGHTS" )
441+
450442 LOGGER .info (f"Facet providers loaded: { len (providers_full )} " )
451443 LOGGER .info (f"Facet rights loaded: { len (rights_full )} " )
444+
445+ # Pass facets to fetch functions
452446 data_no_theme = fetch_europeana_data_without_themes (
453- session , limit = args .limit
447+ session , providers_full , rights_full , limit = args .limit
454448 )
455449 data_with_theme = fetch_europeana_data_with_themes (
456- session , THEMES , limit = args .limit
450+ session , providers_full , rights_full , THEMES , limit = args .limit
457451 )
458452
453+ # Write to CSV and optionally push to git
459454 args = write_data (args , data_no_theme , data_with_theme )
460455 args = shared .git_add_and_commit (
461456 args ,
0 commit comments