Skip to content

Commit f4d53b9

Browse files
committed
query category names instead of using manual list
1 parent 6071d7a commit f4d53b9

File tree

1 file changed

+49
-164
lines changed

1 file changed

+49
-164
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 49 additions & 164 deletions
Original file line numberDiff line numberDiff line change
@@ -36,166 +36,7 @@
3636
# Constants
3737
BASE_URL = "https://oaipmh.arxiv.org/oai"
3838
# Defaults should result in quick operation (not complete operation)
39-
# ArXiv Categories - manually curated from ArXiv official taxonomy
40-
# Source: https://arxiv.org/category_taxonomy
41-
CATEGORIES = {
42-
# Computer Science
43-
"cs.AI": "Artificial Intelligence",
44-
"cs.AR": "Hardware Architecture",
45-
"cs.CC": "Computational Complexity",
46-
"cs.CE": "Computational Engineering, Finance, and Science",
47-
"cs.CG": "Computational Geometry",
48-
"cs.CL": "Computation and Language",
49-
"cs.CR": "Cryptography and Security",
50-
"cs.CV": "Computer Vision and Pattern Recognition",
51-
"cs.CY": "Computers and Society",
52-
"cs.DB": "Databases",
53-
"cs.DC": "Distributed, Parallel, and Cluster Computing",
54-
"cs.DL": "Digital Libraries",
55-
"cs.DM": "Discrete Mathematics",
56-
"cs.DS": "Data Structures and Algorithms",
57-
"cs.ET": "Emerging Technologies",
58-
"cs.FL": "Formal Languages and Automata Theory",
59-
"cs.GL": "General Literature",
60-
"cs.GR": "Graphics",
61-
"cs.GT": "Computer Science and Game Theory",
62-
"cs.HC": "Human-Computer Interaction",
63-
"cs.IR": "Information Retrieval",
64-
"cs.IT": "Information Theory",
65-
"cs.LG": "Machine Learning",
66-
"cs.LO": "Logic in Computer Science",
67-
"cs.MA": "Multiagent Systems",
68-
"cs.MM": "Multimedia",
69-
"cs.MS": "Mathematical Software",
70-
"cs.NA": "Numerical Analysis",
71-
"cs.NE": "Neural and Evolutionary Computing",
72-
"cs.NI": "Networking and Internet Architecture",
73-
"cs.OH": "Other Computer Science",
74-
"cs.OS": "Operating Systems",
75-
"cs.PF": "Performance",
76-
"cs.PL": "Programming Languages",
77-
"cs.RO": "Robotics",
78-
"cs.SC": "Symbolic Computation",
79-
"cs.SD": "Sound",
80-
"cs.SE": "Software Engineering",
81-
"cs.SI": "Social and Information Networks",
82-
"cs.SY": "Systems and Control",
83-
# Mathematics
84-
"math.AC": "Commutative Algebra",
85-
"math.AG": "Algebraic Geometry",
86-
"math.AP": "Analysis of PDEs",
87-
"math.AT": "Algebraic Topology",
88-
"math.CA": "Classical Analysis and ODEs",
89-
"math.CO": "Combinatorics",
90-
"math.CT": "Category Theory",
91-
"math.CV": "Complex Variables",
92-
"math.DG": "Differential Geometry",
93-
"math.DS": "Dynamical Systems",
94-
"math.FA": "Functional Analysis",
95-
"math.GM": "General Mathematics",
96-
"math.GN": "General Topology",
97-
"math.GR": "Group Theory",
98-
"math.GT": "Geometric Topology",
99-
"math.HO": "History and Overview",
100-
"math.IT": "Information Theory",
101-
"math.KT": "K-Theory and Homology",
102-
"math.LO": "Logic",
103-
"math.MG": "Metric Geometry",
104-
"math.MP": "Mathematical Physics",
105-
"math.NA": "Numerical Analysis",
106-
"math.NT": "Number Theory",
107-
"math.OA": "Operator Algebras",
108-
"math.OC": "Optimization and Control",
109-
"math.PR": "Probability",
110-
"math.QA": "Quantum Algebra",
111-
"math.RA": "Rings and Algebras",
112-
"math.RT": "Representation Theory",
113-
"math.SG": "Symplectic Geometry",
114-
"math.SP": "Spectral Theory",
115-
"math.ST": "Statistics Theory",
116-
# Physics
117-
"physics.acc-ph": "Accelerator Physics",
118-
"physics.ao-ph": "Atmospheric and Oceanic Physics",
119-
"physics.app-ph": "Applied Physics",
120-
"physics.atm-clus": "Atomic and Molecular Clusters",
121-
"physics.atom-ph": "Atomic Physics",
122-
"physics.bio-ph": "Biological Physics",
123-
"physics.chem-ph": "Chemical Physics",
124-
"physics.class-ph": "Classical Physics",
125-
"physics.comp-ph": "Computational Physics",
126-
"physics.data-an": "Data Analysis, Statistics and Probability",
127-
"physics.ed-ph": "Physics Education",
128-
"physics.flu-dyn": "Fluid Dynamics",
129-
"physics.gen-ph": "General Physics",
130-
"physics.geo-ph": "Geophysics",
131-
"physics.hist-ph": "History and Philosophy of Physics",
132-
"physics.ins-det": "Instrumentation and Detectors",
133-
"physics.med-ph": "Medical Physics",
134-
"physics.optics": "Optics",
135-
"physics.plasm-ph": "Plasma Physics",
136-
"physics.pop-ph": "Popular Physics",
137-
"physics.soc-ph": "Physics and Society",
138-
"physics.space-ph": "Space Physics",
139-
# Statistics
140-
"stat.AP": "Applications",
141-
"stat.CO": "Computation",
142-
"stat.ME": "Methodology",
143-
"stat.ML": "Machine Learning",
144-
"stat.OT": "Other Statistics",
145-
"stat.TH": "Statistics Theory",
146-
# Quantitative Biology
147-
"q-bio.BM": "Biomolecules",
148-
"q-bio.CB": "Cell Behavior",
149-
"q-bio.GN": "Genomics",
150-
"q-bio.MN": "Molecular Networks",
151-
"q-bio.NC": "Neurons and Cognition",
152-
"q-bio.OT": "Other Quantitative Biology",
153-
"q-bio.PE": "Populations and Evolution",
154-
"q-bio.QM": "Quantitative Methods",
155-
"q-bio.SC": "Subcellular Processes",
156-
"q-bio.TO": "Tissues and Organs",
157-
# Economics
158-
"econ.EM": "Econometrics",
159-
"econ.GN": "General Economics",
160-
"econ.TH": "Theoretical Economics",
161-
# Electrical Engineering
162-
"eess.AS": "Audio and Speech Processing",
163-
"eess.IV": "Image and Video Processing",
164-
"eess.SP": "Signal Processing",
165-
"eess.SY": "Systems and Control",
166-
# High Energy Physics
167-
"hep-ex": "High Energy Physics - Experiment",
168-
"hep-lat": "High Energy Physics - Lattice",
169-
"hep-ph": "High Energy Physics - Phenomenology",
170-
"hep-th": "High Energy Physics - Theory",
171-
# Other Physics
172-
"astro-ph": "Astrophysics",
173-
"astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
174-
"astro-ph.EP": "Earth and Planetary Astrophysics",
175-
"astro-ph.GA": "Astrophysics of Galaxies",
176-
"astro-ph.HE": "High Energy Astrophysical Phenomena",
177-
"astro-ph.IM": "Instrumentation and Methods for Astrophysics",
178-
"astro-ph.SR": "Solar and Stellar Astrophysics",
179-
"cond-mat.dis-nn": "Disordered Systems and Neural Networks",
180-
"cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
181-
"cond-mat.mtrl-sci": "Materials Science",
182-
"cond-mat.other": "Other Condensed Matter",
183-
"cond-mat.quant-gas": "Quantum Gases",
184-
"cond-mat.soft": "Soft Condensed Matter",
185-
"cond-mat.stat-mech": "Statistical Mechanics",
186-
"cond-mat.str-el": "Strongly Correlated Electrons",
187-
"cond-mat.supr-con": "Superconductivity",
188-
"gr-qc": "General Relativity and Quantum Cosmology",
189-
"nlin.AO": "Adaptation and Self-Organizing Systems",
190-
"nlin.CD": "Chaotic Dynamics",
191-
"nlin.CG": "Cellular Automata and Lattice Gases",
192-
"nlin.PS": "Pattern Formation and Solitons",
193-
"nlin.SI": "Exactly Solvable and Integrable Systems",
194-
"nucl-ex": "Nuclear Experiment",
195-
"nucl-th": "Nuclear Theory",
196-
"quant-ph": "Quantum Physics",
197-
}
198-
DEFAULT_FETCH_LIMIT = 1000
39+
DEFAULT_FETCH_LIMIT = 4500 # Fetch 3 batches of 1,500 articles each
19940
DEFAULT_YEARS_BACK = 5
20041
# CSV file paths
20142
FILE_ARXIV_AUTHOR_BUCKET = shared.path_join(
@@ -334,6 +175,45 @@ def get_license_mapping():
334175
)
335176

336177

178+
def query_category_mapping(args, session):
    """
    Query to establish mapping of category codes and names.

    Sends an OAI-PMH ListSets request to BASE_URL and rebinds the
    module-level CATEGORY_MAPPING to a dictionary of category code =>
    category name, sorted by category code.

    Also see https://arxiv.org/category_taxonomy

    Parameters:
        args: parsed command-line arguments (not used by this function).
        session: object providing get(url, params=..., timeout=...) that
            returns a response with raise_for_status() and content.

    Raises:
        shared.QuantifyingException: if the request fails or the server
            responds with an HTTP error status.
    """
    global CATEGORY_MAPPING

    oai_ns = "{http://www.openarchives.org/OAI/2.0/}"
    params = {"verb": "ListSets"}
    try:
        response = session.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.HTTPError as e:
        raise shared.QuantifyingException(f"HTTP Error: {e}", 1) from e
    except requests.RequestException as e:
        raise shared.QuantifyingException(
            f"Request Exception: {e}", 1
        ) from e

    root = etree.fromstring(response.content)
    # Build into a local dictionary so that the global is only replaced
    # once the whole response has been processed
    mapping = {}
    for set_ in root.findall(f".//{oai_ns}set"):
        # Look children up by tag instead of by position: OAI-PMH allows
        # optional setDescription children, which would break unpacking
        # the children into exactly two names
        spec_text = (set_.findtext(f"{oai_ns}setSpec") or "").strip()
        name_text = (set_.findtext(f"{oai_ns}setName") or "").strip()
        if not spec_text or not name_text:
            # Without both a code and a name there is nothing to map;
            # lookups for such codes fall back to their own default
            continue
        # Ensure category code (key) matches code used in articles
        spec_list = spec_text.split(":")
        if len(spec_list) > 1:
            # Remove parent category and replace colon with period
            #   3 part examples:
            #     math:math:AC         => math.AC
            #     physics:astro-ph:CO  => astro-ph.CO
            #   2 part examples:
            #     physics:astro-ph     => astro-ph
            #     physics:quant-ph     => quant-ph
            spec_text = ".".join(spec_list[1:])
        mapping[spec_text] = name_text
    CATEGORY_MAPPING = dict(sorted(mapping.items()))
215+
216+
337217
def extract_record_license(record):
338218
"""
339219
Extract CC license information from OAI-PMH XML record.
@@ -457,16 +337,20 @@ def query_arxiv(args, session):
457337
# resumption token)
458338
proceed = True
459339
while proceed:
340+
if args.limit > 0 and args.limit <= total_fetched:
341+
proceed = False
342+
break
343+
460344
if resumption_token:
461345
# Continue with resumption token
462-
query_params = {
346+
params = {
463347
"verb": "ListRecords",
464348
"resumptionToken": resumption_token,
465349
}
466350
verb = "resuming"
467351
else:
468352
# Initial request with date range
469-
query_params = {
353+
params = {
470354
"verb": "ListRecords",
471355
"metadataPrefix": "arXiv",
472356
"from": args.from_date,
@@ -481,7 +365,7 @@ def query_arxiv(args, session):
481365

482366
try:
483367
# Build OAI-PMH request URL
484-
response = session.get(BASE_URL, params=query_params, timeout=60)
368+
response = session.get(BASE_URL, params=params, timeout=60)
485369
response.raise_for_status()
486370
except requests.HTTPError as e:
487371
raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
@@ -614,7 +498,7 @@ def write_data(args, data):
614498
rows = []
615499
for license_name, categories in data["category_counts"].items():
616500
for code, count in categories.items():
617-
label = CATEGORIES.get(code, code)
501+
label = CATEGORY_MAPPING.get(code, code)
618502
rows.append(
619503
{
620504
"TOOL_IDENTIFIER": license_name,
@@ -687,6 +571,7 @@ def main():
687571
initialize_all_data_files(args)
688572
get_license_mapping()
689573
session = shared.get_session()
574+
query_category_mapping(args, session)
690575
data, cc_articles_found = query_arxiv(args, session)
691576
write_data(args, data)
692577
write_provence(args, cc_articles_found)

0 commit comments

Comments
 (0)