Skip to content

Commit 24bd796

Browse files
committed
remove --years-back option (always start from 2008-02-05 article addition date)
1 parent f4d53b9 commit 24bd796

File tree

1 file changed

+10
-39
lines changed

1 file changed

+10
-39
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 10 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
import traceback
1414
from collections import Counter, defaultdict
1515
from copy import copy
16-
from datetime import datetime, timezone
1716
from operator import itemgetter
1817

1918
# Third-party
@@ -37,7 +36,6 @@
3736
BASE_URL = "https://oaipmh.arxiv.org/oai"
3837
# Defaults should result in quick operation (not complete operation)
3938
DEFAULT_FETCH_LIMIT = 4500 # Fetch 3 batches of 1,500 articles each
40-
DEFAULT_YEARS_BACK = 5
4139
# CSV file paths
4240
FILE_ARXIV_AUTHOR_BUCKET = shared.path_join(
4341
PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv"
@@ -91,40 +89,13 @@ def parse_arguments():
9189
default=DEFAULT_FETCH_LIMIT,
9290
help=(
9391
"Limit number of fetched articles (default:"
94-
f" {DEFAULT_FETCH_LIMIT}). Use a value of -1 to remove limit."
92+
f" {DEFAULT_FETCH_LIMIT}). Use a value of -1 to fetch all articles"
93+
" (remove limit)."
9594
),
9695
)
97-
parser.add_argument(
98-
"--years-back",
99-
type=int,
100-
default=DEFAULT_YEARS_BACK,
101-
help=(
102-
"Number of years back from current year to fetch (default:"
103-
f" {DEFAULT_YEARS_BACK}). Use a value of -1 to specify 2008-02-05"
104-
" (first date a CC licensed article was added)."
105-
),
106-
)
107-
10896
args = parser.parse_args()
10997
if not args.enable_save and args.enable_git:
11098
parser.error("--enable-git requires --enable-save")
111-
# Restrict args.years_back to earliest datetime and initialize
112-
# args.from_date
113-
#
114-
# Survey of records indicated the first CC licenced article was added on
115-
# 2008-02-05
116-
earliest_date = datetime(2008, 2, 5, tzinfo=timezone.utc)
117-
this_year = datetime.now(timezone.utc).year
118-
if args.years_back == -1:
119-
arg_date = earliest_date
120-
else:
121-
start_year = this_year - args.years_back
122-
arg_date = datetime(start_year, 1, 1, tzinfo=timezone.utc)
123-
if arg_date < earliest_date:
124-
arg_date = earliest_date
125-
args.from_date = arg_date.strftime("%Y-%m-%d")
126-
args.years_back = this_year - arg_date.year
127-
12899
return args
129100

130101

@@ -316,9 +287,12 @@ def query_arxiv(args, session):
316287
"""
317288
Query ArXiv OAI-PMH API and return information about CC licensed articles.
318289
"""
290+
if args.limit == -1:
291+
count_desc = "all"
292+
else:
293+
count_desc = f"a maximum of {args.limit}"
319294
LOGGER.info(
320-
f"Querying articles from {args.from_date} onwards ({args.years_back}"
321-
" years back)"
295+
f"Fetching {count_desc} articles starting form add date 2008-02-05"
322296
)
323297

324298
# Data structures for counting
@@ -353,7 +327,7 @@ def query_arxiv(args, session):
353327
params = {
354328
"verb": "ListRecords",
355329
"metadataPrefix": "arXiv",
356-
"from": args.from_date,
330+
"from": "2008-02-05", # First addition of CC licensed articles
357331
}
358332
verb = "starting"
359333

@@ -542,12 +516,9 @@ def write_provence(args, cc_articles_found):
542516
provenance_data = {
543517
"api_description": desc,
544518
"api_endpoint": BASE_URL,
545-
"arguments": {
546-
"from_date": args.from_date,
547-
"limit": args.limit,
548-
"years_back": args.years_back,
549-
},
550519
"cc_articles_found": cc_articles_found,
520+
"fetch_limit": args.limit,
521+
"from_add_date": "2008-02-05",
551522
"quarter": QUARTER,
552523
"script": os.path.basename(__file__),
553524
}

0 commit comments

Comments
 (0)