1313import time
1414import traceback
1515import urllib .parse
16- from collections import defaultdict , Counter
16+ from collections import Counter , defaultdict
1717
1818# Third-party
1919import feedparser
2525from requests .adapters import HTTPAdapter
2626from urllib3 .util .retry import Retry
2727
28-
2928# Add parent directory so shared can be imported
3029sys .path .append (os .path .join (os .path .dirname (__file__ ), ".." ))
3130# Add dev directory for category converter
3231sys .path .append (os .path .join (os .path .dirname (__file__ ), ".." , ".." , "dev" ))
3332
33+ # Third-party
34+ import arxiv_category_converter # noqa: E402
35+
3436# First-party/Local
3537import shared # noqa: E402
36- import arxiv_category_converter # noqa: E402
3738
3839# Setup
3940LOGGER , PATHS = shared .setup (__file__ )
6061 PATHS ["data_1-fetch" ], "arxiv_4_count_by_author_bucket.csv"
6162)
6263# Path of the JSON file that records per-run metadata for audit, reproducibility, and provenance
63- FILE_PROVENANCE = shared .path_join (PATHS ["data_1-fetch" ], "arxiv_provenance.json" )
64+ FILE_PROVENANCE = shared .path_join (
65+ PATHS ["data_1-fetch" ], "arxiv_provenance.json"
66+ )
6467
6568HEADER_COUNT = ["TOOL_IDENTIFIER" , "COUNT" ]
6669HEADER_CATEGORY = ["TOOL_IDENTIFIER" , "CATEGORY" , "COUNT" ]
8184
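8285# Compiled regex patterns for CC license detection, listed most-specific first: the shorter patterns (e.g. CC BY) also match the longer identifiers (e.g. "CC BY-NC-ND"), so keep this order if the consumer takes the first match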
8386CC_PATTERNS = [
84- (re .compile (r'\bCC[-\s]?0\b' , re .IGNORECASE ), "CC0" ),
85- (re .compile (r'\bCC[-\s]?BY[-\s]?NC[-\s]?ND\b' , re .IGNORECASE ),
86- "CC BY-NC-ND" ),
87- (re .compile (r'\bCC[-\s]?BY[-\s]?NC[-\s]?SA\b' , re .IGNORECASE ),
88- "CC BY-NC-SA" ),
89- (re .compile (r'\bCC[-\s]?BY[-\s]?ND\b' , re .IGNORECASE ), "CC BY-ND" ),
90- (re .compile (r'\bCC[-\s]?BY[-\s]?SA\b' , re .IGNORECASE ), "CC BY-SA" ),
91- (re .compile (r'\bCC[-\s]?BY[-\s]?NC\b' , re .IGNORECASE ), "CC BY-NC" ),
92- (re .compile (r'\bCC[-\s]?BY\b' , re .IGNORECASE ), "CC BY" ),
93- (re .compile (r'\bCREATIVE\s+COMMONS\b' , re .IGNORECASE ),
94- "UNKNOWN CC legal tool" ),
87+ (re .compile (r"\bCC[-\s]?0\b" , re .IGNORECASE ), "CC0" ),
88+ (
89+ re .compile (r"\bCC[-\s]?BY[-\s]?NC[-\s]?ND\b" , re .IGNORECASE ),
90+ "CC BY-NC-ND" ,
91+ ),
92+ (
93+ re .compile (r"\bCC[-\s]?BY[-\s]?NC[-\s]?SA\b" , re .IGNORECASE ),
94+ "CC BY-NC-SA" ,
95+ ),
96+ (re .compile (r"\bCC[-\s]?BY[-\s]?ND\b" , re .IGNORECASE ), "CC BY-ND" ),
97+ (re .compile (r"\bCC[-\s]?BY[-\s]?SA\b" , re .IGNORECASE ), "CC BY-SA" ),
98+ (re .compile (r"\bCC[-\s]?BY[-\s]?NC\b" , re .IGNORECASE ), "CC BY-NC" ),
99+ (re .compile (r"\bCC[-\s]?BY\b" , re .IGNORECASE ), "CC BY" ),
100+ (
101+ re .compile (r"\bCREATIVE\s+COMMONS\b" , re .IGNORECASE ),
102+ "UNKNOWN CC legal tool" ,
103+ ),
95104]
96105
97106# Log the start of the script execution
98107LOGGER .info ("Script execution started." )
99108
100109
101110def load_category_map (paths ):
102- """Load category->label mapping from data/arxiv_category_map.yaml if present
111+ """Load category->label mapping from data/arxiv_category_map.yaml.
103112 Returns a dict (possibly empty) and logs failures silently.
104113 """
105114 paths_to_check = []
@@ -115,8 +124,11 @@ def load_category_map(paths):
115124 # allow for looking two levels up (data/)
116125 paths_to_check .append (
117126 os .path .join (
118- os .path .dirname (__file__ ), ".." , ".." , "data" ,
119- "arxiv_category_map.yaml"
127+ os .path .dirname (__file__ ),
128+ ".." ,
129+ ".." ,
130+ "data" ,
131+ "arxiv_category_map.yaml" ,
120132 )
121133 )
122134
@@ -136,7 +148,7 @@ def load_category_map(paths):
136148 else :
137149 print (
138150 f"Warning: Failed to load category map { p } : { e } " ,
139- file = sys .stderr
151+ file = sys .stderr ,
140152 )
141153 return {}
142154
@@ -199,7 +211,7 @@ def get_requests_session():
199211 retry_strategy = Retry (
200212 total = 5 ,
201213 backoff_factor = 1 ,
202- status_forcelist = [408 , 429 , 500 , 502 , 503 , 504 ]
214+ status_forcelist = [408 , 429 , 500 , 502 , 503 , 504 ],
203215 )
204216 session = requests .Session ()
205217 session .headers .update ({"User-Agent" : shared .USER_AGENT })
@@ -208,7 +220,7 @@ def get_requests_session():
208220
209221
210222def normalize_license_text (raw_text : str ) -> str :
211- """Normalize license text to standard CC license identifiers using regex ."""
223+ """Normalize license text to standard CC license identifiers."""
212224 if not raw_text :
213225 return "Unknown"
214226
@@ -233,11 +245,12 @@ def extract_license_info(entry):
233245 return "Unknown"
234246
235247
236-
237248def extract_category_from_entry (entry ):
238249 """Extract primary category from ArXiv entry."""
239- if (hasattr (entry , "arxiv_primary_category" ) and
240- entry .arxiv_primary_category ):
250+ if (
251+ hasattr (entry , "arxiv_primary_category" )
252+ and entry .arxiv_primary_category
253+ ):
241254 return entry .arxiv_primary_category .get ("term" , "Unknown" )
242255 if hasattr (entry , "tags" ) and entry .tags :
243256 # Get first category from tags
@@ -283,8 +296,9 @@ def bucket_author_count(n):
283296 return "11+"
284297
285298
286- def save_count_data (license_counts , category_counts , year_counts ,
287- author_counts ):
299+ def save_count_data (
300+ license_counts , category_counts , year_counts , author_counts
301+ ):
288302 # license_counts: {license: count}
289303 # category_counts: {license: {category_code: count}}
290304 # year_counts: {license: {year: count}}
@@ -299,103 +313,122 @@ def save_count_data(license_counts, category_counts, year_counts,
299313
300314 # Save detailed category counts (code)
301315 with open (FILE_ARXIV_CATEGORY , "w" , newline = "" ) as fh :
302- writer = csv .DictWriter (fh , fieldnames = HEADER_CATEGORY ,
303- dialect = "unix" )
316+ writer = csv .DictWriter (fh , fieldnames = HEADER_CATEGORY , dialect = "unix" )
304317 writer .writeheader ()
305318 for lic , cats in category_counts .items ():
306319 for code , c in cats .items ():
307- writer .writerow ({
308- "TOOL_IDENTIFIER" : lic ,
309- "CATEGORY" : code ,
310- "COUNT" : c
311- })
320+ writer .writerow (
321+ {"TOOL_IDENTIFIER" : lic , "CATEGORY" : code , "COUNT" : c }
322+ )
312323
313324 # Save category report with labels and percent
314325 with open (FILE_ARXIV_CATEGORY_REPORT , "w" , newline = "" ) as fh :
315- writer = csv .DictWriter (fh , fieldnames = HEADER_CATEGORY_REPORT ,
316- dialect = "unix" )
326+ writer = csv .DictWriter (
327+ fh , fieldnames = HEADER_CATEGORY_REPORT , dialect = "unix"
328+ )
317329 writer .writeheader ()
318330 for lic , cats in category_counts .items ():
319331 total_for_license = sum (cats .values ()) or 1
320332 for code , c in cats .items ():
321333 label = CATEGORY_LABELS .get (
322334 code ,
323- code .split ("." )[0 ].upper () if code and "." in code else code
335+ (
336+ code .split ("." )[0 ].upper ()
337+ if code and "." in code
338+ else code
339+ ),
324340 )
325341 pct = round ((c / total_for_license ) * 100 , 2 )
326- writer .writerow ({
327- "TOOL_IDENTIFIER" : lic ,
328- "CATEGORY_CODE" : code ,
329- "CATEGORY_LABEL" : label ,
330- "COUNT" : c ,
331- "PERCENT" : pct ,
332- })
342+ writer .writerow (
343+ {
344+ "TOOL_IDENTIFIER" : lic ,
345+ "CATEGORY_CODE" : code ,
346+ "CATEGORY_LABEL" : label ,
347+ "COUNT" : c ,
348+ "PERCENT" : pct ,
349+ }
350+ )
333351
334352 # Save aggregated category report (top N per license, rest -> Other)
335353 TOP_N = 10
336354 with open (FILE_ARXIV_CATEGORY_REPORT_AGGREGATE , "w" , newline = "" ) as fh :
337355 writer = csv .DictWriter (
338356 fh ,
339- fieldnames = ["TOOL_IDENTIFIER" , "CATEGORY_LABEL" , "COUNT" ,
340- "PERCENT" ],
341- dialect = "unix"
357+ fieldnames = [
358+ "TOOL_IDENTIFIER" ,
359+ "CATEGORY_LABEL" ,
360+ "COUNT" ,
361+ "PERCENT" ,
362+ ],
363+ dialect = "unix" ,
342364 )
343365 writer .writeheader ()
344366 for lic , cats in category_counts .items ():
345367 total_for_license = sum (cats .values ()) or 1
346- sorted_cats = sorted (cats .items (), key = lambda x : x [1 ],
347- reverse = True )
368+ sorted_cats = sorted (
369+ cats .items (), key = lambda x : x [1 ], reverse = True
370+ )
348371 top = sorted_cats [:TOP_N ]
349372 others = sorted_cats [TOP_N :]
350373 other_count = sum (c for _ , c in others )
351374 for code , c in top :
352375 label = CATEGORY_LABELS .get (
353376 code ,
354- code .split ("." )[0 ].upper () if code and "." in code else code
377+ (
378+ code .split ("." )[0 ].upper ()
379+ if code and "." in code
380+ else code
381+ ),
382+ )
383+ writer .writerow (
384+ {
385+ "TOOL_IDENTIFIER" : lic ,
386+ "CATEGORY_LABEL" : label ,
387+ "COUNT" : c ,
388+ "PERCENT" : round ((c / total_for_license ) * 100 , 2 ),
389+ }
355390 )
356- writer .writerow ({
357- "TOOL_IDENTIFIER" : lic ,
358- "CATEGORY_LABEL" : label ,
359- "COUNT" : c ,
360- "PERCENT" : round ((c / total_for_license ) * 100 , 2 ),
361- })
362391 if other_count :
363- writer .writerow ({
364- "TOOL_IDENTIFIER" : lic ,
365- "CATEGORY_LABEL" : "Other" ,
366- "COUNT" : other_count ,
367- "PERCENT" : round ((other_count / total_for_license ) * 100 , 2 ),
368- })
392+ writer .writerow (
393+ {
394+ "TOOL_IDENTIFIER" : lic ,
395+ "CATEGORY_LABEL" : "Other" ,
396+ "COUNT" : other_count ,
397+ "PERCENT" : round (
398+ (other_count / total_for_license ) * 100 , 2
399+ ),
400+ }
401+ )
369402
370403 # Save year counts
371404 with open (FILE_ARXIV_YEAR , "w" , newline = "" ) as fh :
372405 writer = csv .DictWriter (fh , fieldnames = HEADER_YEAR , dialect = "unix" )
373406 writer .writeheader ()
374407 for lic , years in year_counts .items ():
375408 for year , c in years .items ():
376- writer .writerow ({
377- "TOOL_IDENTIFIER" : lic ,
378- "YEAR" : year ,
379- "COUNT" : c
380- })
409+ writer .writerow (
410+ {"TOOL_IDENTIFIER" : lic , "YEAR" : year , "COUNT" : c }
411+ )
381412
382413 # Save detailed author counts (AUTHOR_COUNT as integer or Unknown)
383414 with open (FILE_ARXIV_AUTHOR , "w" , newline = "" ) as fh :
384- writer = csv .DictWriter (fh , fieldnames = HEADER_AUTHOR ,
385- dialect = "unix" )
415+ writer = csv .DictWriter (fh , fieldnames = HEADER_AUTHOR , dialect = "unix" )
386416 writer .writeheader ()
387417 for lic , acs in author_counts .items ():
388418 for ac , c in acs .items ():
389- writer .writerow ({
390- "TOOL_IDENTIFIER" : lic ,
391- "AUTHOR_COUNT" : ac if ac is not None else "Unknown" ,
392- "COUNT" : c
393- })
419+ writer .writerow (
420+ {
421+ "TOOL_IDENTIFIER" : lic ,
422+ "AUTHOR_COUNT" : ac if ac is not None else "Unknown" ,
423+ "COUNT" : c ,
424+ }
425+ )
394426
395427 # Save author buckets summary
396428 with open (FILE_ARXIV_AUTHOR_BUCKET , "w" , newline = "" ) as fh :
397- writer = csv .DictWriter (fh , fieldnames = HEADER_AUTHOR_BUCKET ,
398- dialect = "unix" )
429+ writer = csv .DictWriter (
430+ fh , fieldnames = HEADER_AUTHOR_BUCKET , dialect = "unix"
431+ )
399432 writer .writeheader ()
400433 # build buckets across licenses
401434 for lic , acs in author_counts .items ():
@@ -404,11 +437,9 @@ def save_count_data(license_counts, category_counts, year_counts,
404437 b = bucket_author_count (ac )
405438 bucket_counts [b ] += c
406439 for b , c in bucket_counts .items ():
407- writer .writerow ({
408- "TOOL_IDENTIFIER" : lic ,
409- "AUTHOR_BUCKET" : b ,
410- "COUNT" : c
411- })
440+ writer .writerow (
441+ {"TOOL_IDENTIFIER" : lic , "AUTHOR_BUCKET" : b , "COUNT" : c }
442+ )
412443
413444
414445def query_arxiv (args ):
@@ -523,7 +554,8 @@ def query_arxiv(args):
523554 if consecutive_empty_calls >= 2 :
524555 LOGGER .info (
525556 f"No new papers in 2 consecutive calls for "
526- f"query: { search_query } . Moving over to the next query."
557+ f"query: { search_query } . "
558+ f"Moving over to the next query."
527559 )
528560 break
529561 else :
@@ -535,11 +567,13 @@ def query_arxiv(args):
535567 save_count_data (
536568 license_counts , category_counts , year_counts , author_counts
537569 )
538-
570+
539571 # Convert category codes to user-friendly names
540572 try :
541573 input_file = FILE_ARXIV_CATEGORY
542- output_file = shared .path_join (PATHS ["data_1-fetch" ], "arxiv_2_count_by_category_report.csv" )
574+ output_file = shared .path_join (
575+ PATHS ["data_1-fetch" ], "arxiv_2_count_by_category_report.csv"
576+ )
543577 arxiv_category_converter .convert_categories_to_friendly_names (
544578 input_file , output_file , PATHS ["data" ]
545579 )
0 commit comments