Skip to content

Commit 81753c9

Browse files
committed
improve --limit and --years-back handling
1 parent d311e19 commit 81753c9

1 file changed

Lines changed: 37 additions & 23 deletions

File tree

scripts/1-fetch/arxiv_fetch.py

Lines changed: 37 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,6 @@
11
#!/usr/bin/env python
22
"""
33
Fetch ArXiv papers with CC license information using OAI-PMH API.
4-
5-
This script uses ArXiv's OAI-PMH interface to harvest papers with structured
6-
license metadata, providing more accurate CC license detection than text-based
7-
pattern matching. Focuses on recent years where CC licensing is more common.
84
"""
95
# Standard library
106
import argparse
@@ -16,7 +12,7 @@
1612
import traceback
1713
import xml.etree.ElementTree as ET # XML parsing for OAI-PMH responses
1814
from collections import Counter, defaultdict
19-
from datetime import datetime # Date calculations for harvesting ranges
15+
from datetime import datetime, timezone
2016
from operator import itemgetter
2117

2218
# Third-party
@@ -270,19 +266,18 @@ def parse_arguments():
270266
type=int,
271267
default=DEFAULT_FETCH_LIMIT,
272268
help=(
273-
f"Total limit of papers to fetch "
274-
f"(default: {DEFAULT_FETCH_LIMIT}). "
275-
f"Note: Uses OAI-PMH API for structured license data."
269+
f"Total limit of papers to fetch (default: {DEFAULT_FETCH_LIMIT})."
270+
" Use a value of -1 to remove limit."
276271
),
277272
)
278273
parser.add_argument(
279274
"--years-back",
280275
type=int,
281276
default=DEFAULT_YEARS_BACK,
282277
help=(
283-
f"Number of years back from current year to harvest "
284-
f"(default: {DEFAULT_YEARS_BACK}). "
285-
f"Reduces dataset size and focuses on recent CC-licensed papers."
278+
"Number of years back from current year to harvest (default:"
279+
f" {DEFAULT_YEARS_BACK}). Use a value of -1 to specify"
280+
" <earliestDatestamp>."
286281
),
287282
)
288283
parser.add_argument(
@@ -295,9 +290,27 @@ def parse_arguments():
295290
action="store_true",
296291
help="Enable git actions (fetch, merge, add, commit, and push)",
297292
)
293+
298294
args = parser.parse_args()
299295
if not args.enable_save and args.enable_git:
300296
parser.error("--enable-git requires --enable-save")
297+
# Restrict args.years_back to earliest datetime and initialize
298+
# args.from_date
299+
#
300+
# Earliest is hard coded here. Occasionally, it should be verified against
301+
# <earliestDatestamp> in: https://oaipmh.arxiv.org/oai?verb=Identify
302+
earliest_date = datetime(2005, 9, 16, tzinfo=timezone.utc)
303+
this_year = datetime.now(timezone.utc).year
304+
if args.years_back == -1:
305+
arg_date = earliest_date
306+
else:
307+
start_year = this_year - args.years_back
308+
arg_date = datetime(start_year, 1, 1, tzinfo=timezone.utc)
309+
if arg_date < earliest_date:
310+
arg_date = earliest_date
311+
args.from_date = arg_date.strftime("%Y-%m-%d")
312+
args.years_back = this_year - arg_date.year
313+
301314
return args
302315

303316

@@ -531,13 +544,8 @@ def query_arxiv(args):
531544
LOGGER.info("Beginning to fetch results from ArXiv OAI-PMH API")
532545
session = shared.get_session()
533546

534-
# Calculate date range for harvesting
535-
current_year = datetime.now().year
536-
start_year = current_year - args.years_back
537-
from_date = f"{start_year}-01-01"
538-
539547
LOGGER.info(
540-
f"Harvesting papers from {from_date} onwards "
548+
f"Harvesting papers from {args.from_date} onwards "
541549
f"({args.years_back} years back)"
542550
)
543551

@@ -550,7 +558,10 @@ def query_arxiv(args):
550558
total_fetched = 0
551559
resumption_token = None
552560

553-
while total_fetched < args.limit:
561+
# Proceed is set to False when limit reached or end of records (missing
562+
# resumption token)
563+
proceed = True
564+
while proceed:
554565
try:
555566
# Build OAI-PMH request URL
556567
if resumption_token:
@@ -564,7 +575,7 @@ def query_arxiv(args):
564575
query_params = {
565576
"verb": "ListRecords",
566577
"metadataPrefix": "arXiv",
567-
"from": from_date,
578+
"from": args.from_date,
568579
}
569580

570581
# Make API request
@@ -591,7 +602,8 @@ def query_arxiv(args):
591602
batch_cc_count = 0
592603

593604
for record in records:
594-
if total_fetched >= args.limit:
605+
if args.limit > 0 and args.limit <= total_fetched:
606+
proceed = False
595607
break
596608

597609
# Convert record to string for metadata extraction
@@ -628,12 +640,14 @@ def query_arxiv(args):
628640
resumption_element = root.find(
629641
".//{http://www.openarchives.org/OAI/2.0/}resumptionToken"
630642
)
631-
if resumption_element is not None and resumption_element.text:
643+
if not proceed:
644+
break
645+
elif resumption_element is not None and resumption_element.text:
632646
resumption_token = resumption_element.text
633647
LOGGER.info("Continuing with resumption token...")
634648
else:
635649
LOGGER.info("No more records available")
636-
break
650+
proceed = False
637651

638652
# OAI-PMH requires a 3 second delay between requests
639653
# https://info.arxiv.org/help/api/tou.html#rate-limits
@@ -657,7 +671,7 @@ def query_arxiv(args):
657671
# Save provenance
658672
provenance_data = {
659673
"total_fetched": total_fetched,
660-
"from_date": from_date,
674+
"from_date": args.from_date,
661675
"years_back": args.years_back,
662676
"limit": args.limit,
663677
"quarter": QUARTER,

0 commit comments

Comments
 (0)