2626
2727# Add parent directory so shared can be imported
2828sys .path .append (os .path .join (os .path .dirname (__file__ ), ".." ))
29- # Add dev directory for category converter
30- sys .path .append (os .path .join (os .path .dirname (__file__ ), ".." , ".." , "dev" ))
31-
32- # Third-party
33- import arxiv_category_converter # noqa: E402
3429
3530# First-party/Local
3631import shared # noqa: E402
4944# HTTP Retry Configuration (using shared constants where available)
5045RETRY_TOTAL = 5
5146RETRY_BACKOFF_FACTOR = 1
52- # STATUS_FORCELIST imported from shared.py
47+
5348
5449# Search Queries
5550SEARCH_QUERIES = [
7368
7469# File Paths
7570FILE_ARXIV_COUNT = shared .path_join (PATHS ["data_1-fetch" ], "arxiv_1_count.csv" )
76- FILE_ARXIV_CATEGORY = shared .path_join (
77- PATHS ["data_1-fetch" ], "arxiv_2_count_by_category.csv"
78- )
7971FILE_ARXIV_CATEGORY_REPORT = shared .path_join (
8072 PATHS ["data_1-fetch" ], "arxiv_2_count_by_category_report.csv"
8173)
8577FILE_ARXIV_YEAR = shared .path_join (
8678 PATHS ["data_1-fetch" ], "arxiv_3_count_by_year.csv"
8779)
88- FILE_ARXIV_AUTHOR = shared .path_join (
89- PATHS ["data_1-fetch" ], "arxiv_4_count_by_author_count.csv"
90- )
9180FILE_ARXIV_AUTHOR_BUCKET = shared .path_join (
9281 PATHS ["data_1-fetch" ], "arxiv_4_count_by_author_bucket.csv"
9382)
9483# records metadata for each run for audit, reproducibility, and provenance
9584FILE_PROVENANCE = shared .path_join (PATHS ["data" ], "arxiv_provenance.yaml" )
9685
9786HEADER_COUNT = ["TOOL_IDENTIFIER" , "COUNT" ]
98- HEADER_CATEGORY = ["TOOL_IDENTIFIER" , "CATEGORY" , "COUNT" ]
9987HEADER_CATEGORY_REPORT = [
10088 "TOOL_IDENTIFIER" ,
10189 "CATEGORY_CODE" ,
10492 "PERCENT" ,
10593]
10694HEADER_YEAR = ["TOOL_IDENTIFIER" , "YEAR" , "COUNT" ]
107- HEADER_AUTHOR = ["TOOL_IDENTIFIER" , "AUTHOR_COUNT" , "COUNT" ]
10895HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER" , "AUTHOR_BUCKET" , "COUNT" ]
10996
11097QUARTER = os .path .basename (PATHS ["data_quarter" ])
136123LOGGER .info ("Script execution started." )
137124
138125
139- def load_category_map (paths ):
140- """Load category->label mapping from data/arxiv_category_map.yaml.
141- Returns a dict (possibly empty) and logs failures silently.
142- """
143- paths_to_check = []
144- # use the repository data directory
145- repository_data_dir = (
146- paths .get ("data" ) if isinstance (paths , dict ) else None
147- )
148- if repository_data_dir :
149- paths_to_check .append (
150- os .path .join (repository_data_dir , "arxiv_category_map.yaml" )
151- )
152-
153- # allow for looking two levels up (data/)
154- paths_to_check .append (
155- os .path .join (
156- os .path .dirname (__file__ ),
157- ".." ,
158- ".." ,
159- "data" ,
160- "arxiv_category_map.yaml" ,
161- )
162- )
163-
164- for p in paths_to_check :
165- p = os .path .abspath (os .path .realpath (p ))
166- try :
167- if os .path .exists (p ):
168- with open (p , "r" , encoding = "utf-8" ) as fh :
169- data = yaml .safe_load (fh )
170- if isinstance (data , dict ):
171- # Normalise keys/values to strings for readability
172- return {str (k ).strip (): str (v ) for k , v in data .items ()}
173- except Exception as e :
174- LOGGER = globals ().get ("LOGGER" )
175- if LOGGER :
176- LOGGER .warning ("Failed to load category map %s: %s" , p , e )
177- else :
178- print (
179- f"Warning: Failed to load category map { p } : { e } " ,
180- file = sys .stderr ,
181- )
182- return {}
183-
184-
185126# parsing arguments function
186127def parse_arguments ():
187128 """Parse command-line options, returns parsed argument namespace."""
@@ -229,9 +170,8 @@ def initialize_all_data_files(args):
229170
230171 os .makedirs (PATHS ["data_1-fetch" ], exist_ok = True )
231172 initialize_data_file (FILE_ARXIV_COUNT , HEADER_COUNT )
232- initialize_data_file (FILE_ARXIV_CATEGORY , HEADER_CATEGORY )
173+ initialize_data_file (FILE_ARXIV_CATEGORY_REPORT , HEADER_CATEGORY_REPORT )
233174 initialize_data_file (FILE_ARXIV_YEAR , HEADER_YEAR )
234- initialize_data_file (FILE_ARXIV_AUTHOR , HEADER_AUTHOR )
235175 initialize_data_file (FILE_ARXIV_AUTHOR_BUCKET , HEADER_AUTHOR_BUCKET )
236176
237177
@@ -249,7 +189,12 @@ def get_requests_session():
249189
250190
251191def normalize_license_text (raw_text ):
252- """Normalize license text to standard CC license identifiers."""
192+ """
193+ Convert raw license text to standardized CC license identifiers.
194+
195+ Uses regex patterns to identify CC licenses from paper text.
196+ Returns specific license (e.g., "CC BY", "CC0") or "Unknown".
197+ """
253198 if not raw_text :
254199 return "Unknown"
255200
@@ -261,7 +206,12 @@ def normalize_license_text(raw_text):
261206
262207
263208def extract_license_info (entry ):
264- """Extract CC license information from ArXiv entry."""
209+ """
210+ Extract CC license information from ArXiv paper entry.
211+
212+ Checks rights field first, then summary field for license patterns.
213+ Returns normalized license identifier or "Unknown".
214+ """
265215 # checking through the rights field first then summary
266216 if hasattr (entry , "rights" ) and entry .rights :
267217 license_info = normalize_license_text (entry .rights )
@@ -314,6 +264,12 @@ def extract_author_count_from_entry(entry):
314264
315265
316266def bucket_author_count (n ):
267+ """
268+ Convert author count to predefined buckets for analysis.
269+
270+ Buckets: "1", "2-3", "4-6", "7-10", "11+", "Unknown"
271+ Reduces granularity for better statistical analysis.
272+ """
317273 if n is None :
318274 return "Unknown"
319275 if n == 1 :
@@ -330,6 +286,10 @@ def bucket_author_count(n):
330286def save_count_data (
331287 license_counts , category_counts , year_counts , author_counts
332288):
289+ """
290+ Save all collected data to CSV files.
291+
292+ """
333293 # license_counts: {license: count}
334294 # category_counts: {license: {category_code: count}}
335295 # year_counts: {license: {year: count}}
@@ -342,16 +302,6 @@ def save_count_data(
342302 for lic , c in license_counts .items ():
343303 writer .writerow ({"TOOL_IDENTIFIER" : lic , "COUNT" : c })
344304
345- # Save detailed category counts (code)
346- with open (FILE_ARXIV_CATEGORY , "w" , newline = "" , encoding = "utf-8" ) as fh :
347- writer = csv .DictWriter (fh , fieldnames = HEADER_CATEGORY , dialect = "unix" )
348- writer .writeheader ()
349- for lic , cats in category_counts .items ():
350- for code , c in cats .items ():
351- writer .writerow (
352- {"TOOL_IDENTIFIER" : lic , "CATEGORY" : code , "COUNT" : c }
353- )
354-
355305 # Save category report with labels and percent
356306 with open (
357307 FILE_ARXIV_CATEGORY_REPORT , "w" , newline = "" , encoding = "utf-8"
@@ -363,14 +313,7 @@ def save_count_data(
363313 for lic , cats in category_counts .items ():
364314 total_for_license = sum (cats .values ()) or 1
365315 for code , c in cats .items ():
366- label = CATEGORY_LABELS .get (
367- code ,
368- (
369- code .split ("." )[0 ].upper ()
370- if code and "." in code
371- else code
372- ),
373- )
316+ label = shared .normalize_arxiv_category (code , CATEGORY_LABELS )
374317 pct = round ((c / total_for_license ) * 100 , 2 )
375318 writer .writerow (
376319 {
@@ -391,6 +334,7 @@ def save_count_data(
391334 fh ,
392335 fieldnames = [
393336 "TOOL_IDENTIFIER" ,
337+ "CATEGORY_CODE" ,
394338 "CATEGORY_LABEL" ,
395339 "COUNT" ,
396340 "PERCENT" ,
@@ -407,17 +351,11 @@ def save_count_data(
407351 others = sorted_cats [TOP_N :]
408352 other_count = sum (c for _ , c in others )
409353 for code , c in top :
410- label = CATEGORY_LABELS .get (
411- code ,
412- (
413- code .split ("." )[0 ].upper ()
414- if code and "." in code
415- else code
416- ),
417- )
354+ label = shared .normalize_arxiv_category (code , CATEGORY_LABELS )
418355 writer .writerow (
419356 {
420357 "TOOL_IDENTIFIER" : lic ,
358+ "CATEGORY_CODE" : code ,
421359 "CATEGORY_LABEL" : label ,
422360 "COUNT" : c ,
423361 "PERCENT" : round ((c / total_for_license ) * 100 , 2 ),
@@ -427,6 +365,7 @@ def save_count_data(
427365 writer .writerow (
428366 {
429367 "TOOL_IDENTIFIER" : lic ,
368+ "CATEGORY_CODE" : "OTHER" ,
430369 "CATEGORY_LABEL" : "Other" ,
431370 "COUNT" : other_count ,
432371 "PERCENT" : round (
@@ -445,20 +384,6 @@ def save_count_data(
445384 {"TOOL_IDENTIFIER" : lic , "YEAR" : year , "COUNT" : c }
446385 )
447386
448- # Save detailed author counts (AUTHOR_COUNT as integer or Unknown)
449- with open (FILE_ARXIV_AUTHOR , "w" , newline = "" , encoding = "utf-8" ) as fh :
450- writer = csv .DictWriter (fh , fieldnames = HEADER_AUTHOR , dialect = "unix" )
451- writer .writeheader ()
452- for lic , acs in author_counts .items ():
453- for ac , c in acs .items ():
454- writer .writerow (
455- {
456- "TOOL_IDENTIFIER" : lic ,
457- "AUTHOR_COUNT" : ac if ac is not None else "Unknown" ,
458- "COUNT" : c ,
459- }
460- )
461-
462387 # Save author buckets summary
463388 with open (
464389 FILE_ARXIV_AUTHOR_BUCKET , "w" , newline = "" , encoding = "utf-8"
@@ -480,17 +405,17 @@ def save_count_data(
480405
481406
482407def query_arxiv (args ):
483- """Query ArXiv API for papers with potential CC licenses."""
408+ """
409+ Main function to query ArXiv API and collect CC license data.
410+
411+ """
484412
485413 LOGGER .info ("Beginning to fetch results from ArXiv API" )
486414 session = get_requests_session ()
487- try :
488- loaded = load_category_map (PATHS )
489- if loaded :
490- # overlay loaded map over default
491- CATEGORY_LABELS .update (loaded )
492- except Exception as e :
493- LOGGER .warning ("Error loading external arXiv category map: %s" , e )
415+
416+ # Load category mappings using shared function
417+ CATEGORY_LABELS .update (shared .load_arxiv_categories (PATHS .get ("data" )))
418+
494419 results_per_iteration = RESULTS_PER_REQUEST
495420
496421 search_queries = SEARCH_QUERIES
@@ -584,25 +509,11 @@ def query_arxiv(args):
584509 consecutive_empty_calls = 0
585510
586511 # Save results
587-
588512 if args .enable_save :
589513 save_count_data (
590514 license_counts , category_counts , year_counts , author_counts
591515 )
592516
593- # Convert category codes to user-friendly names
594- try :
595- input_file = FILE_ARXIV_CATEGORY
596- output_file = shared .path_join (
597- PATHS ["data_1-fetch" ], "arxiv_2_count_by_category_report.csv"
598- )
599- arxiv_category_converter .convert_categories_to_friendly_names (
600- input_file , output_file , PATHS ["data" ]
601- )
602- LOGGER .info (f"Category conversion completed: { output_file } " )
603- except Exception as e :
604- LOGGER .warning (f"Category conversion failed: { e } " )
605-
606517 # save provenance
607518 provenance_data = {
608519 "total_fetched" : total_fetched ,
0 commit comments