Skip to content

Commit 6e11fcc

Browse files
committed
switch to much faster lxml for XML parsing
1 parent 35d868c commit 6e11fcc

File tree

3 files changed

+154
-32
lines changed

3 files changed

+154
-32
lines changed

Pipfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ cachetools = "*" # Required by google-api-python-client
 feedparser = "*"
 GitPython = "*"
 google-api-python-client = "*"
+lxml = "*"
 matplotlib = "*"
 pandas = "*"
 Pygments = "*"

Pipfile.lock

Lines changed: 148 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scripts/1-fetch/arxiv_fetch.py

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
 import textwrap
 import time
 import traceback
-import xml.etree.ElementTree as ET  # XML parsing for OAI-PMH responses
 from collections import Counter, defaultdict
 from copy import copy
 from datetime import datetime, timezone
@@ -20,6 +19,7 @@
 # Third-party
 import requests
 import yaml
+from lxml import etree
 from pygments import highlight
 from pygments.formatters import TerminalFormatter
 from pygments.lexers import PythonTracebackLexer
@@ -339,11 +339,7 @@ def extract_license_from_xml(record_xml):
339339
Extract CC license information from OAI-PMH XML record.
340340
Returns normalized license identifier or specific error indicator.
341341
"""
342-
try:
343-
root = ET.fromstring(record_xml)
344-
except ET.ParseError as e:
345-
LOGGER.error(f"Licensing extraction failed: XML Parse Error: {e}")
346-
return "XML Parse Error"
342+
root = etree.fromstring(record_xml)
347343

348344
# Find license element in arXiv namespace
349345
license_element = root.find(".//{http://arxiv.org/OAI/arXiv/}license")
@@ -371,11 +367,7 @@ def extract_metadata_from_xml(record_xml):
371367
372368
Returns dict with category, year, author_count, and license info.
373369
"""
374-
try:
375-
root = ET.fromstring(record_xml)
376-
except ET.ParseError as e:
377-
LOGGER.error(f"Metadata extraction failed: XML Parse Error: {e}")
378-
return {}
370+
root = etree.fromstring(record_xml)
379371

380372
# Extract category (primary category from categories field)
381373
categories_elem = root.find(".//{http://arxiv.org/OAI/arXiv/}categories")
@@ -471,10 +463,7 @@ def query_arxiv(args, session):
471463
except requests.RequestException as e:
472464
raise shared.QuantifyingException(f"Request Exception: {e}", 1)
473465

474-
try:
475-
root = ET.fromstring(response.content)
476-
except ET.ParseError as e:
477-
raise shared.QuantifyingException(f"XML Parse Error: {e}", 1)
466+
root = etree.fromstring(response.content)
478467

479468
# Check for errors
480469
error_element = root.find(
@@ -497,7 +486,7 @@ def query_arxiv(args, session):
497486
total_fetched += 1
498487

499488
# Convert record to string for metadata extraction
500-
record_xml = ET.tostring(record, encoding="unicode")
489+
record_xml = etree.tostring(record, encoding="unicode")
501490
metadata = extract_metadata_from_xml(record_xml)
502491

503492
# Only process CC-licensed articles

0 commit comments

Comments
 (0)