@@ -363,8 +363,25 @@ def extract_record_metadata(record):
363363 """
364364 Extract paper metadata from OAI-PMH XML record.
365365
366- Returns dict with category, year, author_count , and license info.
366+ Returns dict with added_on, author_count, category, license, and year (empty dict if the record is not CC licensed).
367367 """
368+
369+ # Extract license first to avoid unnecessary work
370+ license_info = extract_record_license (record )
371+ if not license_info .startswith ("CC" ):
372+ return {}
373+
374+ # Extract added-on date (OAI-PMH record datestamp)
375+ added_on_elem = record .find (
376+ ".//{http://www.openarchives.org/OAI/2.0/}datestamp"
377+ )
378+ if added_on_elem is not None and added_on_elem .text :
379+ added_on = added_on_elem .text .strip ()
380+
381+ # Extract author count
382+ authors = record .findall (".//{http://arxiv.org/OAI/arXiv/}author" )
383+ author_count = len (authors ) if authors else 0
384+
368385 # Extract category (primary category from categories field)
369386 categories_elem = record .find (".//{http://arxiv.org/OAI/arXiv/}categories" )
370387 if categories_elem is not None and categories_elem .text :
@@ -373,7 +390,7 @@ def extract_record_metadata(record):
373390 else :
374391 category = "Unknown"
375392
376- # Extract year from 1) updated date , 2) created date
393+ # Extract year from 1) updated, 2) created
377394 updated_elem = record .find (".//{http://arxiv.org/OAI/arXiv/}updated" )
378395 if updated_elem is not None and updated_elem .text :
379396 try :
@@ -396,19 +413,14 @@ def extract_record_metadata(record):
396413 else :
397414 year = "Unknown"
398415
399- # Extract author count
400- authors = record .findall (".//{http://arxiv.org/OAI/arXiv/}author" )
401- author_count = len (authors ) if authors else 0
402-
403- # Extract license
404- license_info = extract_record_license (record )
405-
406- return {
407- "category" : category ,
408- "year" : year ,
416+ metadata = {
417+ "added_on" : added_on ,
409418 "author_count" : author_count ,
419+ "category" : category ,
410420 "license" : license_info ,
421+ "year" : year ,
411422 }
423+ return metadata
412424
413425
414426def bucket_author_count (author_count ):
@@ -435,10 +447,10 @@ def query_arxiv(args, session):
435447 year_counts = defaultdict (lambda : defaultdict (int ))
436448 author_counts = defaultdict (lambda : defaultdict (int ))
437449
450+ batch = 1
438451 total_fetched = 0
439452 cc_articles_found = 0
440- max_year = False
441- min_year = False
453+ min_added_on = False
442454 resumption_token = None
443455
444456 # Proceed is set to False when limit reached or end of records (missing
@@ -462,7 +474,10 @@ def query_arxiv(args, session):
462474 verb = "starting"
463475
464476 # Make API request
465- LOGGER .info (f"Fetching batch { verb } from record { total_fetched } " )
477+ LOGGER .info (
478+ f"Fetching batch { batch } { verb } from record { total_fetched } "
479+ )
480+ batch += 1
466481
467482 try :
468483 # Build OAI-PMH request URL
@@ -484,7 +499,7 @@ def query_arxiv(args, session):
484499 f"OAI-PMH Error: { error_element .text } " , 1
485500 )
486501
487- # Process records
502+ # Process batch of article records
488503 records = root .findall (
489504 ".//{http://www.openarchives.org/OAI/2.0/}record"
490505 )
@@ -496,38 +511,34 @@ def query_arxiv(args, session):
496511 total_fetched += 1
497512
498513 metadata = extract_record_metadata (record )
499- # Only process CC-licensed articles
500- if metadata ["license" ].startswith ("CC" ):
501- license_info = metadata ["license" ]
502- category = metadata ["category" ]
503- year = metadata ["year" ]
504- author_count = metadata ["author_count" ]
505-
506- # Count by license
507- license_counts [license_info ] += 1
508-
509- # Count by category and license
510- category_counts [license_info ][category ] += 1
511-
512- # Count by year and license
513- year_counts [license_info ][year ] += 1
514-
515- # Count by author count and license
516- author_counts [license_info ][author_count ] += 1
517-
518- batch_cc_count += 1
519- cc_articles_found += 1
520- if not min_year or year < min_year :
521- min_year = year
522- if not max_year or year > max_year :
523- max_year = year
524-
525- if min_year == max_year :
526- LOGGER .info (f" Batch articles were added in { min_year } " )
527- else :
528- LOGGER .info (
529- f" Batch articles are from { min_year } through { max_year } "
530- )
514+ if not metadata : # Empty metadata means the article is not CC licensed
515+ continue
516+ added_on = metadata ["added_on" ]
517+ if not min_added_on or added_on < min_added_on :
518+ min_added_on = added_on
519+
520+ license_info = metadata ["license" ]
521+
522+ # Count by author count and license
523+ author_count = metadata ["author_count" ]
524+ author_counts [license_info ][author_count ] += 1
525+
526+ # Count by category and license
527+ category = metadata ["category" ]
528+ category_counts [license_info ][category ] += 1
529+
530+ # Count by license
531+ license_counts [license_info ] += 1
532+
533+ # Count by year and license
534+ year = metadata ["year" ]
535+ year_counts [license_info ][year ] += 1
536+
537+ batch_cc_count += 1
538+ cc_articles_found += 1
539+
540+ if min_added_on :
541+ LOGGER .info (f" Earliest CC article addition: { min_added_on } " )
531542 LOGGER .info (
532543 f" Batch CC licensed articles: { batch_cc_count } , Total"
533544 f" CC-licensed articles: { cc_articles_found } "
@@ -544,6 +555,7 @@ def query_arxiv(args, session):
544555 else :
545556 LOGGER .info ("No more records available" )
546557 proceed = False
558+ break
547559
548560 # OAI-PMH requires a 3 second delay between requests
549561 # https://info.arxiv.org/help/api/tou.html#rate-limits
@@ -640,15 +652,18 @@ def write_provence(args, cc_articles_found):
640652 return args
641653
642654 # Save provenance
655+ desc = "Open Archives Initiative Protocol for Metadata Havesting (OAI-PMH)"
643656 provenance_data = {
657+ "api_description" : desc ,
658+ "api_endpoint" : BASE_URL ,
659+ "arguments" : {
660+ "from_date" : args .from_date ,
661+ "limit" : args .limit ,
662+ "years_back" : args .years_back ,
663+ },
644664 "cc_articles_found" : cc_articles_found ,
645- "from_date" : args .from_date ,
646- "years_back" : args .years_back ,
647- "limit" : args .limit ,
648665 "quarter" : QUARTER ,
649666 "script" : os .path .basename (__file__ ),
650- "api_endpoint" : BASE_URL ,
651- "method" : "OAI-PMH structured license harvesting" ,
652667 }
653668
654669 # Write provenance YAML for auditing
@@ -664,7 +679,6 @@ def write_provence(args, cc_articles_found):
664679
665680
666681def main ():
667- """Main function."""
668682 args = parse_arguments ()
669683 shared .paths_log (LOGGER , PATHS )
670684 shared .git_fetch_and_merge (args , PATHS ["repo" ])
0 commit comments