1111import textwrap
1212import time
1313import traceback
14- import xml .etree .ElementTree as ET # XML parsing for OAI-PMH responses
1514from collections import Counter , defaultdict
1615from copy import copy
1716from datetime import datetime , timezone
2019# Third-party
2120import requests
2221import yaml
22+ from lxml import etree
2323from pygments import highlight
2424from pygments .formatters import TerminalFormatter
2525from pygments .lexers import PythonTracebackLexer
@@ -339,11 +339,7 @@ def extract_license_from_xml(record_xml):
339339 Extract CC license information from OAI-PMH XML record.
340340 Returns normalized license identifier or specific error indicator.
341341 """
342- try :
343- root = ET .fromstring (record_xml )
344- except ET .ParseError as e :
345- LOGGER .error (f"Licensing extraction failed: XML Parse Error: { e } " )
346- return "XML Parse Error"
342+ root = etree .fromstring (record_xml )
347343
348344 # Find license element in arXiv namespace
349345 license_element = root .find (".//{http://arxiv.org/OAI/arXiv/}license" )
@@ -371,11 +367,7 @@ def extract_metadata_from_xml(record_xml):
371367
372368 Returns dict with category, year, author_count, and license info.
373369 """
374- try :
375- root = ET .fromstring (record_xml )
376- except ET .ParseError as e :
377- LOGGER .error (f"Metadata extraction failed: XML Parse Error: { e } " )
378- return {}
370+ root = etree .fromstring (record_xml )
379371
380372 # Extract category (primary category from categories field)
381373 categories_elem = root .find (".//{http://arxiv.org/OAI/arXiv/}categories" )
@@ -471,10 +463,7 @@ def query_arxiv(args, session):
471463 except requests .RequestException as e :
472464 raise shared .QuantifyingException (f"Request Exception: { e } " , 1 )
473465
474- try :
475- root = ET .fromstring (response .content )
476- except ET .ParseError as e :
477- raise shared .QuantifyingException (f"XML Parse Error: { e } " , 1 )
466+ root = etree .fromstring (response .content )
478467
479468 # Check for errors
480469 error_element = root .find (
@@ -497,7 +486,7 @@ def query_arxiv(args, session):
497486 total_fetched += 1
498487
499488 # Convert record to string for metadata extraction
500- record_xml = ET .tostring (record , encoding = "unicode" )
489+ record_xml = etree .tostring (record , encoding = "unicode" )
501490 metadata = extract_metadata_from_xml (record_xml )
502491
503492 # Only process CC-licensed articles
0 commit comments