Skip to content

Commit 98e3dd4

Browse files
committed
add added on, optimize processing, and organize data
1 parent 2618d2a commit 98e3dd4

File tree

1 file changed

+68
-54
lines changed

1 file changed

+68
-54
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 68 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -363,8 +363,25 @@ def extract_record_metadata(record):
363363
"""
364364
Extract paper metadata from OAI-PMH XML record.
365365
366-
Returns dict with category, year, author_count, and license info.
366+
Returns added_on, author_count, category, license, and year ({} if not CC).
367367
"""
368+
369+
# Extract license first to avoid unnecessary work
370+
license_info = extract_record_license(record)
371+
if not license_info.startswith("CC"):
372+
return {}
373+
374+
# Extract added on
375+
added_on_elem = record.find(
376+
".//{http://www.openarchives.org/OAI/2.0/}datestamp"
377+
)
378+
if added_on_elem is not None and added_on_elem.text:
379+
added_on = added_on_elem.text.strip()
380+
381+
# Extract author count
382+
authors = record.findall(".//{http://arxiv.org/OAI/arXiv/}author")
383+
author_count = len(authors) if authors else 0
384+
368385
# Extract category (primary category from categories field)
369386
categories_elem = record.find(".//{http://arxiv.org/OAI/arXiv/}categories")
370387
if categories_elem is not None and categories_elem.text:
@@ -373,7 +390,7 @@ def extract_record_metadata(record):
373390
else:
374391
category = "Unknown"
375392

376-
# Extract year from 1) updated date, 2) created date
393+
# Extract year from 1) updated, 2) created
377394
updated_elem = record.find(".//{http://arxiv.org/OAI/arXiv/}updated")
378395
if updated_elem is not None and updated_elem.text:
379396
try:
@@ -396,19 +413,14 @@ def extract_record_metadata(record):
396413
else:
397414
year = "Unknown"
398415

399-
# Extract author count
400-
authors = record.findall(".//{http://arxiv.org/OAI/arXiv/}author")
401-
author_count = len(authors) if authors else 0
402-
403-
# Extract license
404-
license_info = extract_record_license(record)
405-
406-
return {
407-
"category": category,
408-
"year": year,
416+
metadata = {
417+
"added_on": added_on,
409418
"author_count": author_count,
419+
"category": category,
410420
"license": license_info,
421+
"year": year,
411422
}
423+
return metadata
412424

413425

414426
def bucket_author_count(author_count):
@@ -435,10 +447,10 @@ def query_arxiv(args, session):
435447
year_counts = defaultdict(lambda: defaultdict(int))
436448
author_counts = defaultdict(lambda: defaultdict(int))
437449

450+
batch = 1
438451
total_fetched = 0
439452
cc_articles_found = 0
440-
max_year = False
441-
min_year = False
453+
min_added_on = False
442454
resumption_token = None
443455

444456
# Proceed is set to False when limit reached or end of records (missing
@@ -462,7 +474,10 @@ def query_arxiv(args, session):
462474
verb = "starting"
463475

464476
# Make API request
465-
LOGGER.info(f"Fetching batch {verb} from record {total_fetched}")
477+
LOGGER.info(
478+
f"Fetching batch {batch} {verb} from record {total_fetched}"
479+
)
480+
batch += 1
466481

467482
try:
468483
# Build OAI-PMH request URL
@@ -484,7 +499,7 @@ def query_arxiv(args, session):
484499
f"OAI-PMH Error: {error_element.text}", 1
485500
)
486501

487-
# Process records
502+
# Process batch of article records
488503
records = root.findall(
489504
".//{http://www.openarchives.org/OAI/2.0/}record"
490505
)
@@ -496,38 +511,34 @@ def query_arxiv(args, session):
496511
total_fetched += 1
497512

498513
metadata = extract_record_metadata(record)
499-
# Only process CC-licensed articles
500-
if metadata["license"].startswith("CC"):
501-
license_info = metadata["license"]
502-
category = metadata["category"]
503-
year = metadata["year"]
504-
author_count = metadata["author_count"]
505-
506-
# Count by license
507-
license_counts[license_info] += 1
508-
509-
# Count by category and license
510-
category_counts[license_info][category] += 1
511-
512-
# Count by year and license
513-
year_counts[license_info][year] += 1
514-
515-
# Count by author count and license
516-
author_counts[license_info][author_count] += 1
517-
518-
batch_cc_count += 1
519-
cc_articles_found += 1
520-
if not min_year or year < min_year:
521-
min_year = year
522-
if not max_year or year > max_year:
523-
max_year = year
524-
525-
if min_year == max_year:
526-
LOGGER.info(f" Batch articles were added in {min_year}")
527-
else:
528-
LOGGER.info(
529-
f" Batch articles are from {min_year} through {max_year}"
530-
)
514+
if not metadata: # Empty dict: article is not CC licensed, skip it
515+
continue
516+
added_on = metadata["added_on"]
517+
if not min_added_on or added_on < min_added_on:
518+
min_added_on = added_on
519+
520+
license_info = metadata["license"]
521+
522+
# Count by author count and license
523+
author_count = metadata["author_count"]
524+
author_counts[license_info][author_count] += 1
525+
526+
# Count by category and license
527+
category = metadata["category"]
528+
category_counts[license_info][category] += 1
529+
530+
# Count by license
531+
license_counts[license_info] += 1
532+
533+
# Count by year and license
534+
year = metadata["year"]
535+
year_counts[license_info][year] += 1
536+
537+
batch_cc_count += 1
538+
cc_articles_found += 1
539+
540+
if min_added_on:
541+
LOGGER.info(f" Earliest CC article addition: {min_added_on}")
531542
LOGGER.info(
532543
f" Batch CC licensed articles: {batch_cc_count}, Total"
533544
f" CC-licensed articles: {cc_articles_found}"
@@ -544,6 +555,7 @@ def query_arxiv(args, session):
544555
else:
545556
LOGGER.info("No more records available")
546557
proceed = False
558+
break
547559

548560
# OAI-PMH requires a 3 second delay between requests
549561
# https://info.arxiv.org/help/api/tou.html#rate-limits
@@ -640,15 +652,18 @@ def write_provence(args, cc_articles_found):
640652
return args
641653

642654
# Save provenance
655+
desc = "Open Archives Initiative Protocol for Metadata Havesting (OAI-PMH)"
643656
provenance_data = {
657+
"api_description": desc,
658+
"api_endpoint": BASE_URL,
659+
"arguments": {
660+
"from_date": args.from_date,
661+
"limit": args.limit,
662+
"years_back": args.years_back,
663+
},
644664
"cc_articles_found": cc_articles_found,
645-
"from_date": args.from_date,
646-
"years_back": args.years_back,
647-
"limit": args.limit,
648665
"quarter": QUARTER,
649666
"script": os.path.basename(__file__),
650-
"api_endpoint": BASE_URL,
651-
"method": "OAI-PMH structured license harvesting",
652667
}
653668

654669
# Write provenance YAML for auditing
@@ -664,7 +679,6 @@ def write_provence(args, cc_articles_found):
664679

665680

666681
def main():
667-
"""Main function."""
668682
args = parse_arguments()
669683
shared.paths_log(LOGGER, PATHS)
670684
shared.git_fetch_and_merge(args, PATHS["repo"])

0 commit comments

Comments
 (0)