Skip to content

Commit 6071d7a

Browse files
committed
update earliest date and remove added on processing
1 parent 98e3dd4 commit 6071d7a

File tree

1 file changed

+20
-18
lines changed

1 file changed

+20
-18
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -259,8 +259,8 @@ def parse_arguments():
259259
default=DEFAULT_YEARS_BACK,
260260
help=(
261261
"Number of years back from current year to fetch (default:"
262-
f" {DEFAULT_YEARS_BACK}). Use a value of -1 to specify"
263-
" <earliestDatestamp>."
262+
f" {DEFAULT_YEARS_BACK}). Use a value of -1 to specify 2008-02-05"
263+
" (first date a CC licensed article was added)."
264264
),
265265
)
266266

@@ -270,9 +270,9 @@ def parse_arguments():
270270
# Restrict args.years_back to earliest datetime and initialize
271271
# args.from_date
272272
#
273-
# Earliest is hard coded here. Occasionally, it should be verified against
274-
# <earliestDatestamp> in: https://oaipmh.arxiv.org/oai?verb=Identify
275-
earliest_date = datetime(2005, 9, 16, tzinfo=timezone.utc)
273+
# Survey of records indicated the first CC licensed article was added on
274+
# 2008-02-05
275+
earliest_date = datetime(2008, 2, 5, tzinfo=timezone.utc)
276276
this_year = datetime.now(timezone.utc).year
277277
if args.years_back == -1:
278278
arg_date = earliest_date
@@ -371,12 +371,12 @@ def extract_record_metadata(record):
371371
if not license_info.startswith("CC"):
372372
return {}
373373

374-
# Extract added on
375-
added_on_elem = record.find(
376-
".//{http://www.openarchives.org/OAI/2.0/}datestamp"
377-
)
378-
if added_on_elem is not None and added_on_elem.text:
379-
added_on = added_on_elem.text.strip()
374+
# # Extract added on
375+
# added_on_elem = record.find(
376+
# ".//{http://www.openarchives.org/OAI/2.0/}datestamp"
377+
# )
378+
# if added_on_elem is not None and added_on_elem.text:
379+
# added_on = added_on_elem.text.strip()
380380

381381
# Extract author count
382382
authors = record.findall(".//{http://arxiv.org/OAI/arXiv/}author")
@@ -414,7 +414,7 @@ def extract_record_metadata(record):
414414
year = "Unknown"
415415

416416
metadata = {
417-
"added_on": added_on,
417+
# "added_on": added_on,
418418
"author_count": author_count,
419419
"category": category,
420420
"license": license_info,
@@ -450,7 +450,7 @@ def query_arxiv(args, session):
450450
batch = 1
451451
total_fetched = 0
452452
cc_articles_found = 0
453-
min_added_on = False
453+
# min_added_on = False
454454
resumption_token = None
455455

456456
# Proceed is set to False when limit reached or end of records (missing
@@ -513,9 +513,10 @@ def query_arxiv(args, session):
513513
metadata = extract_record_metadata(record)
514514
if not metadata: # Empty unless the article is CC licensed
515515
continue
516-
added_on = metadata["added_on"]
517-
if not min_added_on or added_on < min_added_on:
518-
min_added_on = added_on
516+
517+
# added_on = metadata["added_on"]
518+
# if not min_added_on or added_on < min_added_on:
519+
# min_added_on = added_on
519520

520521
license_info = metadata["license"]
521522

@@ -537,8 +538,9 @@ def query_arxiv(args, session):
537538
batch_cc_count += 1
538539
cc_articles_found += 1
539540

540-
if min_added_on:
541-
LOGGER.info(f" Earliest CC article addition: {min_added_on}")
541+
# if min_added_on:
542+
# LOGGER.info(f" Earliest CC article addition: {min_added_on}")
543+
542544
LOGGER.info(
543545
f" Batch CC licensed articles: {batch_cc_count}, Total"
544546
f" CC-licensed articles: {cc_articles_found}"

0 commit comments

Comments
 (0)