Skip to content

Commit 2618d2a

Browse files
committed
improve data handling and date handling
- avoid switching back and forth between string and Element objects
- check the updated date first, then fall back to the created date
1 parent 6e11fcc commit 2618d2a

File tree

1 file changed

+36
-19
lines changed

1 file changed

+36
-19
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -334,15 +334,13 @@ def get_license_mapping():
334334
)
335335

336336

337-
def extract_license_from_xml(record_xml):
337+
def extract_record_license(record):
338338
"""
339339
Extract CC license information from OAI-PMH XML record.
340340
Returns normalized license identifier or specific error indicator.
341341
"""
342-
root = etree.fromstring(record_xml)
343-
344342
# Find license element in arXiv namespace
345-
license_element = root.find(".//{http://arxiv.org/OAI/arXiv/}license")
343+
license_element = record.find(".//{http://arxiv.org/OAI/arXiv/}license")
346344

347345
if license_element is not None and license_element.text:
348346
license_url = license_element.text.strip()
@@ -361,39 +359,49 @@ def extract_license_from_xml(record_xml):
361359
return "No license field"
362360

363361

364-
def extract_metadata_from_xml(record_xml):
362+
def extract_record_metadata(record):
365363
"""
366364
Extract paper metadata from OAI-PMH XML record.
367365
368366
Returns dict with category, year, author_count, and license info.
369367
"""
370-
root = etree.fromstring(record_xml)
371-
372368
# Extract category (primary category from categories field)
373-
categories_elem = root.find(".//{http://arxiv.org/OAI/arXiv/}categories")
369+
categories_elem = record.find(".//{http://arxiv.org/OAI/arXiv/}categories")
374370
if categories_elem is not None and categories_elem.text:
375371
# Take first category as primary
376372
category = categories_elem.text.strip().split()[0]
377373
else:
378374
category = "Unknown"
379375

380-
# Extract year from created date
381-
created_elem = root.find(".//{http://arxiv.org/OAI/arXiv/}created")
382-
if created_elem is not None and created_elem.text:
376+
# Extract year from 1) updated date, 2) created date
377+
updated_elem = record.find(".//{http://arxiv.org/OAI/arXiv/}updated")
378+
if updated_elem is not None and updated_elem.text:
383379
try:
384-
year = created_elem.text.strip()[:4] # Extract year
380+
year = updated_elem.text.strip()[:4] # Extract year
385381
except (AttributeError, IndexError) as e:
386382
LOGGER.error(
387-
f"Failed to extract year from '{created_elem.text}': {e}"
383+
f"Failed to extract year from '{updated_elem.text}': {e}"
388384
)
389385
year = "Unknown"
386+
else:
387+
created_elem = record.find(".//{http://arxiv.org/OAI/arXiv/}created")
388+
if created_elem is not None and created_elem.text:
389+
try:
390+
year = created_elem.text.strip()[:4] # Extract year
391+
except (AttributeError, IndexError) as e:
392+
LOGGER.error(
393+
f"Failed to extract year from '{created_elem.text}': {e}"
394+
)
395+
year = "Unknown"
396+
else:
397+
year = "Unknown"
390398

391399
# Extract author count
392-
authors = root.findall(".//{http://arxiv.org/OAI/arXiv/}author")
400+
authors = record.findall(".//{http://arxiv.org/OAI/arXiv/}author")
393401
author_count = len(authors) if authors else 0
394402

395403
# Extract license
396-
license_info = extract_license_from_xml(record_xml)
404+
license_info = extract_record_license(record)
397405

398406
return {
399407
"category": category,
@@ -429,6 +437,8 @@ def query_arxiv(args, session):
429437

430438
total_fetched = 0
431439
cc_articles_found = 0
440+
max_year = False
441+
min_year = False
432442
resumption_token = None
433443

434444
# Proceed is set to False when limit reached or end of records (missing
@@ -485,10 +495,7 @@ def query_arxiv(args, session):
485495
break
486496
total_fetched += 1
487497

488-
# Convert record to string for metadata extraction
489-
record_xml = etree.tostring(record, encoding="unicode")
490-
metadata = extract_metadata_from_xml(record_xml)
491-
498+
metadata = extract_record_metadata(record)
492499
# Only process CC-licensed articles
493500
if metadata["license"].startswith("CC"):
494501
license_info = metadata["license"]
@@ -510,7 +517,17 @@ def query_arxiv(args, session):
510517

511518
batch_cc_count += 1
512519
cc_articles_found += 1
520+
if not min_year or year < min_year:
521+
min_year = year
522+
if not max_year or year > max_year:
523+
max_year = year
513524

525+
if min_year == max_year:
526+
LOGGER.info(f" Batch articles were added in {min_year}")
527+
else:
528+
LOGGER.info(
529+
f" Batch articles are from {min_year} through {max_year}"
530+
)
514531
LOGGER.info(
515532
f" Batch CC licensed articles: {batch_cc_count}, Total"
516533
f" CC-licensed articles: {cc_articles_found}"

0 commit comments

Comments
 (0)