11#!/usr/bin/env python
22"""
3- Fetch ArXiv papers with CC license information using OAI-PMH API.
3+ Fetch ArXiv articles with CC license information using OAI-PMH API.
4+ OAI-PMH: Open Archives Initiative Protocol for Metadata Harvesting.
45"""
56# Standard library
67import argparse
1213import traceback
1314import xml .etree .ElementTree as ET # XML parsing for OAI-PMH responses
1415from collections import Counter , defaultdict
16+ from copy import copy
1517from datetime import datetime , timezone
1618from operator import itemgetter
1719
3234LOGGER , PATHS = shared .setup (__file__ )
3335
3436# Constants
35- # API Configuration - Updated to use OAI-PMH for structured license data
3637BASE_URL = "https://oaipmh.arxiv.org/oai"
37- # Implementation choice: Set to 1000 CC-licensed papers for balanced collection
38- # This is NOT an ArXiv API requirement - ArXiv only requires "responsible" use
39- # The 3-second delays between requests ensure compliance with OAI-PMH practices
40- DEFAULT_FETCH_LIMIT = 1000 # Default total CC-licensed papers to fetch
41- DEFAULT_YEARS_BACK = 5 # Default years to look back from current year
42-
43- # CSV Headers
44- HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER" , "AUTHOR_BUCKET" , "COUNT" ]
45- HEADER_CATEGORY_REPORT = [
46- "TOOL_IDENTIFIER" ,
47- "CATEGORY_CODE" ,
48- "CATEGORY_LABEL" ,
49- "COUNT" ,
50- ]
51- HEADER_COUNT = ["TOOL_IDENTIFIER" , "COUNT" ]
52- HEADER_YEAR = ["TOOL_IDENTIFIER" , "YEAR" , "COUNT" ]
53-
54- # License mapping for structured data from OAI-PMH
55- LICENSE_MAPPING = {
56- "http://creativecommons.org/licenses/by/3.0/" : "CC BY 3.0" ,
57- "http://creativecommons.org/licenses/by/4.0/" : "CC BY 4.0" ,
58- "http://creativecommons.org/licenses/by-nc/3.0/" : "CC BY-NC 3.0" ,
59- "http://creativecommons.org/licenses/by-nc/4.0/" : "CC BY-NC 4.0" ,
60- "http://creativecommons.org/licenses/by-nc-nd/3.0/" : "CC BY-NC-ND 3.0" ,
61- "http://creativecommons.org/licenses/by-nc-nd/4.0/" : "CC BY-NC-ND 4.0" ,
62- "http://creativecommons.org/licenses/by-nc-sa/3.0/" : "CC BY-NC-SA 3.0" ,
63- "http://creativecommons.org/licenses/by-nc-sa/4.0/" : "CC BY-NC-SA 4.0" ,
64- "http://creativecommons.org/licenses/by-nd/3.0/" : "CC BY-ND 3.0" ,
65- "http://creativecommons.org/licenses/by-nd/4.0/" : "CC BY-ND 4.0" ,
66- "http://creativecommons.org/licenses/by-sa/3.0/" : "CC BY-SA 3.0" ,
67- "http://creativecommons.org/licenses/by-sa/4.0/" : "CC BY-SA 4.0" ,
68- "http://creativecommons.org/licenses/publicdomain" : "CC CERTIFICATION 1.0"
69- " US" ,
70- "http://creativecommons.org/publicdomain/zero/1.0/" : "CC0 1.0" ,
71- "http://creativecommons.org/share-your-work/public-domain/cc0/" : "CC0" ,
72- }
73-
38+ # Defaults should result in quick operation (not complete operation)
7439# ArXiv Categories - manually curated from ArXiv official taxonomy
7540# Source: https://arxiv.org/category_taxonomy
7641CATEGORIES = {
230195 "nucl-th" : "Nuclear Theory" ,
231196 "quant-ph" : "Quantum Physics" ,
232197}
233-
234- # File Paths
198+ DEFAULT_FETCH_LIMIT = 1000
199+ DEFAULT_YEARS_BACK = 5
200+ # CSV file paths
235201FILE_ARXIV_AUTHOR_BUCKET = shared .path_join (
236202 PATHS ["data_1-fetch" ], "arxiv_4_count_by_author_bucket.csv"
237203)
242208FILE_ARXIV_YEAR = shared .path_join (
243209 PATHS ["data_1-fetch" ], "arxiv_3_count_by_year.csv"
244210)
245- # records metadata for each run for audit, reproducibility, and provenance
246211FILE_PROVENANCE = shared .path_join (
247212 PATHS ["data_1-fetch" ], "arxiv_provenance.yaml"
248213)
249-
250- # Runtime variables
214+ # CSV headers
215+ HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER" , "AUTHOR_BUCKET" , "COUNT" ]
216+ HEADER_CATEGORY_REPORT = [
217+ "TOOL_IDENTIFIER" ,
218+ "CATEGORY_CODE" ,
219+ "CATEGORY_LABEL" ,
220+ "COUNT" ,
221+ ]
222+ HEADER_COUNT = ["TOOL_IDENTIFIER" , "COUNT" ]
223+ HEADER_YEAR = ["TOOL_IDENTIFIER" , "YEAR" , "COUNT" ]
251224QUARTER = os .path .basename (PATHS ["data_quarter" ])
252225
253226
254227# parsing arguments function
255228def parse_arguments ():
256229 """Parse command-line options, returns parsed argument namespace.
257230
258- Note: The --limit parameter sets the total number of papers to fetch.
231+ Note: The --limit parameter sets the total number of articles to fetch.
259232 The --years-back parameter limits harvesting to recent years where
260233 CC licensing is more common.
261234 """
262235 LOGGER .info ("Parsing command-line options" )
263236 parser = argparse .ArgumentParser (description = __doc__ )
237+ parser .add_argument (
238+ "--enable-save" ,
239+ action = "store_true" ,
240+ help = "Enable saving results" ,
241+ )
242+ parser .add_argument (
243+ "--enable-git" ,
244+ action = "store_true" ,
245+ help = "Enable git actions (fetch, merge, add, commit, and push)" ,
246+ )
264247 parser .add_argument (
265248 "--limit" ,
266249 type = int ,
267250 default = DEFAULT_FETCH_LIMIT ,
268251 help = (
269- f"Total limit of papers to fetch (default: { DEFAULT_FETCH_LIMIT } ). "
270- " Use a value of -1 to remove limit."
252+ "Limit number of fetched articles (default:"
253+ f" { DEFAULT_FETCH_LIMIT } ). Use a value of -1 to remove limit."
271254 ),
272255 )
273256 parser .add_argument (
274257 "--years-back" ,
275258 type = int ,
276259 default = DEFAULT_YEARS_BACK ,
277260 help = (
278- "Number of years back from current year to harvest (default:"
261+ "Number of years back from current year to fetch (default:"
279262 f" { DEFAULT_YEARS_BACK } ). Use a value of -1 to specify"
280263 " <earliestDatestamp>."
281264 ),
282265 )
283- parser .add_argument (
284- "--enable-save" ,
285- action = "store_true" ,
286- help = "Enable saving results" ,
287- )
288- parser .add_argument (
289- "--enable-git" ,
290- action = "store_true" ,
291- help = "Enable git actions (fetch, merge, add, commit, and push)" ,
292- )
293266
294267 args = parser .parse_args ()
295268 if not args .enable_save and args .enable_git :
@@ -339,6 +312,28 @@ def initialize_all_data_files(args):
339312 initialize_data_file (FILE_ARXIV_AUTHOR_BUCKET , HEADER_AUTHOR_BUCKET )
340313
341314
def get_license_mapping():
    """
    Load the CC legal tool URL-to-identifier mapping into LICENSE_MAPPING.

    Reads "cc-legal-tools.csv" from the PATHS["data"] directory and maps each
    row's normalized CANONICAL_URL (URL scheme and trailing slashes removed)
    to the row's IDENTIFIER. A legacy public domain certification URL that is
    added by hand completes the mapping. The result, ordered by identifier, is
    stored in the module-level LICENSE_MAPPING; nothing is returned.
    """
    global LICENSE_MAPPING
    LOGGER.info("Loading CC Legal Tool metadata for license mapping")
    file_path = shared.path_join(PATHS["data"], "cc-legal-tools.csv")
    license_mapping = {}
    # newline="" lets the csv module do its own newline handling, as its
    # documentation recommends for file objects passed to a reader
    with open(file_path, "r", encoding="utf-8", newline="") as file_obj:
        for row in csv.DictReader(file_obj, dialect="unix"):
            # Strip both schemes so that keys are normalized the same way
            # extract_license_from_xml() normalizes the URLs it looks up
            simple_url = row["CANONICAL_URL"].replace("http://", "")
            simple_url = simple_url.replace("https://", "")
            simple_url = simple_url.rstrip("/")
            license_mapping[simple_url] = row["IDENTIFIER"]

    # Add legacy entry
    # NOTE(review): this identifier does not begin with "CC", while
    # query_arxiv() only counts licenses that start with "CC" -- confirm
    # whether records using this legacy URL are meant to be excluded
    license_mapping["creativecommons.org/licenses/publicdomain"] = (
        "CERTIFICATION 1.0 US"
    )

    # Order entries by identifier (the mapping's values)
    LICENSE_MAPPING = dict(sorted(license_mapping.items(), key=itemgetter(1)))
335+
336+
342337def extract_license_from_xml (record_xml ):
343338 """
344339 Extract CC license information from OAI-PMH XML record.
@@ -355,9 +350,12 @@ def extract_license_from_xml(record_xml):
355350
356351 if license_element is not None and license_element .text :
357352 license_url = license_element .text .strip ()
353+ simple_url = copy (license_url ).replace ("http://" , "" )
354+ simple_url = simple_url .replace ("https://" , "" )
355+ simple_url = simple_url .rstrip ("/" )
358356 # Check exact mapping first
359- if license_url in LICENSE_MAPPING :
360- return LICENSE_MAPPING [license_url ]
357+ if simple_url in LICENSE_MAPPING :
358+ return LICENSE_MAPPING [simple_url ]
361359 # Validate CC URLs more strictly
362360 elif "creativecommons.org" in license_url .lower ():
363361 return f"CC (ambiguous): { license_url } "
@@ -424,17 +422,11 @@ def bucket_author_count(author_count):
424422
425423def query_arxiv (args , session ):
426424 """
427- Main function to query ArXiv OAI-PMH API and collect CC license data.
428-
429- Uses structured license metadata from OAI-PMH instead of text search.
430- Harvests papers from recent years to focus on CC-licensed content.
425+ Query ArXiv OAI-PMH API and return information about CC licensed articles.
431426 """
432-
433- LOGGER .info ("Beginning to fetch results from ArXiv OAI-PMH API" )
434-
435427 LOGGER .info (
436- f"Harvesting papers from { args .from_date } onwards "
437- f"( { args . years_back } years back)"
428+ f"Querying articles from { args .from_date } onwards ( { args . years_back } "
429+ " years back)"
438430 )
439431
440432 # Data structures for counting
@@ -444,6 +436,7 @@ def query_arxiv(args, session):
444436 author_counts = defaultdict (lambda : defaultdict (int ))
445437
446438 total_fetched = 0
439+ cc_articles_found = 0
447440 resumption_token = None
448441
449442 # Proceed is set to False when limit reached or end of records (missing
@@ -501,12 +494,13 @@ def query_arxiv(args, session):
501494 if args .limit > 0 and args .limit <= total_fetched :
502495 proceed = False
503496 break
497+ total_fetched += 1
504498
505499 # Convert record to string for metadata extraction
506500 record_xml = ET .tostring (record , encoding = "unicode" )
507501 metadata = extract_metadata_from_xml (record_xml )
508502
509- # Only process CC-licensed papers
503+ # Only process CC-licensed articles
510504 if metadata ["license" ].startswith ("CC" ):
511505 license_info = metadata ["license" ]
512506 category = metadata ["category" ]
@@ -525,11 +519,12 @@ def query_arxiv(args, session):
525519 # Count by author count and license
526520 author_counts [license_info ][author_count ] += 1
527521
528- total_fetched += 1
529522 batch_cc_count += 1
523+ cc_articles_found += 1
530524
531525 LOGGER .info (
532- f" Batch completed: { batch_cc_count } CC-licensed papers found"
526+ f" Batch CC licensed articles: { batch_cc_count } , Total"
527+ f" CC-licensed articles: { cc_articles_found } "
533528 )
534529
535530 # Check for resumption token
@@ -548,16 +543,13 @@ def query_arxiv(args, session):
548543 # https://info.arxiv.org/help/api/tou.html#rate-limits
549544 time .sleep (3 )
550545
551- LOGGER .info (f"Total papers with CC licenses fetched: { total_fetched } " )
552-
553546 data = {
554547 "author_counts" : author_counts ,
555548 "category_counts" : category_counts ,
556549 "license_counts" : license_counts ,
557550 "year_counts" : year_counts ,
558551 }
559-
560- return data , total_fetched
552+ return data , cc_articles_found
561553
562554
563555def rows_to_csv (args , fieldnames , rows , file_path ):
@@ -634,7 +626,7 @@ def write_data(args, data):
634626 rows_to_csv (args , HEADER_YEAR , rows , FILE_ARXIV_YEAR )
635627
636628
637- def write_provence (args , total_fetched ):
629+ def write_provence (args , cc_articles_found ):
638630 """
639631 Write provenance information to YAML file.
640632 """
@@ -643,7 +635,7 @@ def write_provence(args, total_fetched):
643635
644636 # Save provenance
645637 provenance_data = {
646- "total_fetched " : total_fetched ,
638+ "cc_articles_found " : cc_articles_found ,
647639 "from_date" : args .from_date ,
648640 "years_back" : args .years_back ,
649641 "limit" : args .limit ,
@@ -667,15 +659,15 @@ def write_provence(args, total_fetched):
667659
668660def main ():
669661 """Main function."""
670- LOGGER .info ("Script execution started." )
671662 args = parse_arguments ()
672663 shared .paths_log (LOGGER , PATHS )
673664 shared .git_fetch_and_merge (args , PATHS ["repo" ])
674665 initialize_all_data_files (args )
666+ get_license_mapping ()
675667 session = shared .get_session ()
676- data , total_fetched = query_arxiv (args , session )
668+ data , cc_articles_found = query_arxiv (args , session )
677669 write_data (args , data )
678- write_provence (args , total_fetched )
670+ write_provence (args , cc_articles_found )
679671 args = shared .git_add_and_commit (
680672 args ,
681673 PATHS ["repo" ],
0 commit comments