11#!/usr/bin/env python
22"""
33Fetch ArXiv papers with CC license information using OAI-PMH API.
4-
5- This script uses ArXiv's OAI-PMH interface to harvest papers with structured
6- license metadata, providing more accurate CC license detection than text-based
7- pattern matching. Focuses on recent years where CC licensing is more common.
84"""
95# Standard library
106import argparse
1612import traceback
1713import xml .etree .ElementTree as ET # XML parsing for OAI-PMH responses
1814from collections import Counter , defaultdict
19- from datetime import datetime # Date calculations for harvesting ranges
15+ from datetime import datetime , timezone
2016from operator import itemgetter
2117
2218# Third-party
@@ -270,19 +266,18 @@ def parse_arguments():
270266 type = int ,
271267 default = DEFAULT_FETCH_LIMIT ,
272268 help = (
273- f"Total limit of papers to fetch "
274- f"(default: { DEFAULT_FETCH_LIMIT } ). "
275- f"Note: Uses OAI-PMH API for structured license data."
269+ f"Total limit of papers to fetch (default: { DEFAULT_FETCH_LIMIT } )."
270+ " Use a value of -1 to remove limit."
276271 ),
277272 )
278273 parser .add_argument (
279274 "--years-back" ,
280275 type = int ,
281276 default = DEFAULT_YEARS_BACK ,
282277 help = (
283- f "Number of years back from current year to harvest "
284- f"(default: { DEFAULT_YEARS_BACK } ). "
285- f"Reduces dataset size and focuses on recent CC-licensed papers ."
278+ "Number of years back from current year to harvest (default: "
279+ f" { DEFAULT_YEARS_BACK } ). Use a value of -1 to specify "
280+ " <earliestDatestamp> ."
286281 ),
287282 )
288283 parser .add_argument (
@@ -295,9 +290,27 @@ def parse_arguments():
295290 action = "store_true" ,
296291 help = "Enable git actions (fetch, merge, add, commit, and push)" ,
297292 )
293+
298294 args = parser .parse_args ()
299295 if not args .enable_save and args .enable_git :
300296 parser .error ("--enable-git requires --enable-save" )
297+ # Clamp the harvest start date to the earliest available datestamp, then
298+ # set args.from_date and recompute args.years_back from the clamped date
299+ #
300+ # Earliest is hard coded here. Occasionally, it should be verified against
301+ # <earliestDatestamp> in: https://oaipmh.arxiv.org/oai?verb=Identify
302+ earliest_date = datetime (2005 , 9 , 16 , tzinfo = timezone .utc )
303+ this_year = datetime .now (timezone .utc ).year
304+ if args .years_back == - 1 :
305+ arg_date = earliest_date
306+ else :
307+ start_year = this_year - args .years_back
308+ arg_date = datetime (start_year , 1 , 1 , tzinfo = timezone .utc )
309+ if arg_date < earliest_date :
310+ arg_date = earliest_date
311+ args .from_date = arg_date .strftime ("%Y-%m-%d" )
312+ args .years_back = this_year - arg_date .year
313+
301314 return args
302315
303316
@@ -531,13 +544,8 @@ def query_arxiv(args):
531544 LOGGER .info ("Beginning to fetch results from ArXiv OAI-PMH API" )
532545 session = shared .get_session ()
533546
534- # Calculate date range for harvesting
535- current_year = datetime .now ().year
536- start_year = current_year - args .years_back
537- from_date = f"{ start_year } -01-01"
538-
539547 LOGGER .info (
540- f"Harvesting papers from { from_date } onwards "
548+ f"Harvesting papers from { args . from_date } onwards "
541549 f"({ args .years_back } years back)"
542550 )
543551
@@ -550,7 +558,10 @@ def query_arxiv(args):
550558 total_fetched = 0
551559 resumption_token = None
552560
553- while total_fetched < args .limit :
561+ # proceed is set to False when the limit is reached or the records are
562+ # exhausted (no resumption token in the response)
563+ proceed = True
564+ while proceed :
554565 try :
555566 # Build OAI-PMH request URL
556567 if resumption_token :
@@ -564,7 +575,7 @@ def query_arxiv(args):
564575 query_params = {
565576 "verb" : "ListRecords" ,
566577 "metadataPrefix" : "arXiv" ,
567- "from" : from_date ,
578+ "from" : args . from_date ,
568579 }
569580
570581 # Make API request
@@ -591,7 +602,8 @@ def query_arxiv(args):
591602 batch_cc_count = 0
592603
593604 for record in records :
594- if total_fetched >= args .limit :
605+ if args .limit > 0 and args .limit <= total_fetched :
606+ proceed = False
595607 break
596608
597609 # Convert record to string for metadata extraction
@@ -628,12 +640,14 @@ def query_arxiv(args):
628640 resumption_element = root .find (
629641 ".//{http://www.openarchives.org/OAI/2.0/}resumptionToken"
630642 )
631- if resumption_element is not None and resumption_element .text :
643+ if not proceed :
644+ break
645+ elif resumption_element is not None and resumption_element .text :
632646 resumption_token = resumption_element .text
633647 LOGGER .info ("Continuing with resumption token..." )
634648 else :
635649 LOGGER .info ("No more records available" )
636- break
650+ proceed = False
637651
638652 # OAI-PMH requires a 3 second delay between requests
639653 # https://info.arxiv.org/help/api/tou.html#rate-limits
@@ -657,7 +671,7 @@ def query_arxiv(args):
657671 # Save provenance
658672 provenance_data = {
659673 "total_fetched" : total_fetched ,
660- "from_date" : from_date ,
674+ "from_date" : args . from_date ,
661675 "years_back" : args .years_back ,
662676 "limit" : args .limit ,
663677 "quarter" : QUARTER ,
0 commit comments