@@ -334,15 +334,13 @@ def get_license_mapping():
334334 )
335335
336336
337- def extract_license_from_xml ( record_xml ):
337+ def extract_record_license ( record ):
338338 """
339339 Extract CC license information from OAI-PMH XML record.
340340 Returns normalized license identifier or specific error indicator.
341341 """
342- root = etree .fromstring (record_xml )
343-
344342 # Find license element in arXiv namespace
345- license_element = root .find (".//{http://arxiv.org/OAI/arXiv/}license" )
343+ license_element = record .find (".//{http://arxiv.org/OAI/arXiv/}license" )
346344
347345 if license_element is not None and license_element .text :
348346 license_url = license_element .text .strip ()
@@ -361,39 +359,49 @@ def extract_license_from_xml(record_xml):
361359 return "No license field"
362360
363361
364- def extract_metadata_from_xml ( record_xml ):
362+ def extract_record_metadata ( record ):
365363 """
366364 Extract paper metadata from OAI-PMH XML record.
367365
368366 Returns dict with category, year, author_count, and license info.
369367 """
370- root = etree .fromstring (record_xml )
371-
372368 # Extract category (primary category from categories field)
373- categories_elem = root .find (".//{http://arxiv.org/OAI/arXiv/}categories" )
369+ categories_elem = record .find (".//{http://arxiv.org/OAI/arXiv/}categories" )
374370 if categories_elem is not None and categories_elem .text :
375371 # Take first category as primary
376372 category = categories_elem .text .strip ().split ()[0 ]
377373 else :
378374 category = "Unknown"
379375
380- # Extract year from created date
381- created_elem = root .find (".//{http://arxiv.org/OAI/arXiv/}created " )
382- if created_elem is not None and created_elem .text :
376+ # Extract year from 1) updated date, 2) created date
377+ updated_elem = record .find (".//{http://arxiv.org/OAI/arXiv/}updated " )
378+ if updated_elem is not None and updated_elem .text :
383379 try :
384- year = created_elem .text .strip ()[:4 ] # Extract year
380+ year = updated_elem .text .strip ()[:4 ] # Extract year
385381 except (AttributeError , IndexError ) as e :
386382 LOGGER .error (
387- f"Failed to extract year from '{ created_elem .text } ': { e } "
383+ f"Failed to extract year from '{ updated_elem .text } ': { e } "
388384 )
389385 year = "Unknown"
386+ else :
387+ created_elem = record .find (".//{http://arxiv.org/OAI/arXiv/}created" )
388+ if created_elem is not None and created_elem .text :
389+ try :
390+ year = created_elem .text .strip ()[:4 ] # Extract year
391+ except (AttributeError , IndexError ) as e :
392+ LOGGER .error (
393+ f"Failed to extract year from '{ created_elem .text } ': { e } "
394+ )
395+ year = "Unknown"
396+ else :
397+ year = "Unknown"
390398
391399 # Extract author count
392- authors = root .findall (".//{http://arxiv.org/OAI/arXiv/}author" )
400+ authors = record .findall (".//{http://arxiv.org/OAI/arXiv/}author" )
393401 author_count = len (authors ) if authors else 0
394402
395403 # Extract license
396- license_info = extract_license_from_xml ( record_xml )
404+ license_info = extract_record_license ( record )
397405
398406 return {
399407 "category" : category ,
@@ -429,6 +437,8 @@ def query_arxiv(args, session):
429437
430438 total_fetched = 0
431439 cc_articles_found = 0
440+ max_year = False
441+ min_year = False
432442 resumption_token = None
433443
434444 # Proceed is set to False when limit reached or end of records (missing
@@ -485,10 +495,7 @@ def query_arxiv(args, session):
485495 break
486496 total_fetched += 1
487497
488- # Convert record to string for metadata extraction
489- record_xml = etree .tostring (record , encoding = "unicode" )
490- metadata = extract_metadata_from_xml (record_xml )
491-
498+ metadata = extract_record_metadata (record )
492499 # Only process CC-licensed articles
493500 if metadata ["license" ].startswith ("CC" ):
494501 license_info = metadata ["license" ]
@@ -510,7 +517,17 @@ def query_arxiv(args, session):
510517
511518 batch_cc_count += 1
512519 cc_articles_found += 1
520+ if not min_year or year < min_year :
521+ min_year = year
522+ if not max_year or year > max_year :
523+ max_year = year
513524
525+ if min_year == max_year :
526+ LOGGER .info (f" Batch articles were added in { min_year } " )
527+ else :
528+ LOGGER .info (
529+ f" Batch articles are from { min_year } through { max_year } "
530+ )
514531 LOGGER .info (
515532 f" Batch CC licensed articles: { batch_cc_count } , Total"
516533 f" CC-licensed articles: { cc_articles_found } "
0 commit comments