3636# Constants
3737BASE_URL = "https://oaipmh.arxiv.org/oai"
3838# Defaults should result in quick operation (not complete operation)
39- # ArXiv Categories - manually curated from ArXiv official taxonomy
40- # Source: https://arxiv.org/category_taxonomy
41- CATEGORIES = {
42- # Computer Science
43- "cs.AI" : "Artificial Intelligence" ,
44- "cs.AR" : "Hardware Architecture" ,
45- "cs.CC" : "Computational Complexity" ,
46- "cs.CE" : "Computational Engineering, Finance, and Science" ,
47- "cs.CG" : "Computational Geometry" ,
48- "cs.CL" : "Computation and Language" ,
49- "cs.CR" : "Cryptography and Security" ,
50- "cs.CV" : "Computer Vision and Pattern Recognition" ,
51- "cs.CY" : "Computers and Society" ,
52- "cs.DB" : "Databases" ,
53- "cs.DC" : "Distributed, Parallel, and Cluster Computing" ,
54- "cs.DL" : "Digital Libraries" ,
55- "cs.DM" : "Discrete Mathematics" ,
56- "cs.DS" : "Data Structures and Algorithms" ,
57- "cs.ET" : "Emerging Technologies" ,
58- "cs.FL" : "Formal Languages and Automata Theory" ,
59- "cs.GL" : "General Literature" ,
60- "cs.GR" : "Graphics" ,
61- "cs.GT" : "Computer Science and Game Theory" ,
62- "cs.HC" : "Human-Computer Interaction" ,
63- "cs.IR" : "Information Retrieval" ,
64- "cs.IT" : "Information Theory" ,
65- "cs.LG" : "Machine Learning" ,
66- "cs.LO" : "Logic in Computer Science" ,
67- "cs.MA" : "Multiagent Systems" ,
68- "cs.MM" : "Multimedia" ,
69- "cs.MS" : "Mathematical Software" ,
70- "cs.NA" : "Numerical Analysis" ,
71- "cs.NE" : "Neural and Evolutionary Computing" ,
72- "cs.NI" : "Networking and Internet Architecture" ,
73- "cs.OH" : "Other Computer Science" ,
74- "cs.OS" : "Operating Systems" ,
75- "cs.PF" : "Performance" ,
76- "cs.PL" : "Programming Languages" ,
77- "cs.RO" : "Robotics" ,
78- "cs.SC" : "Symbolic Computation" ,
79- "cs.SD" : "Sound" ,
80- "cs.SE" : "Software Engineering" ,
81- "cs.SI" : "Social and Information Networks" ,
82- "cs.SY" : "Systems and Control" ,
83- # Mathematics
84- "math.AC" : "Commutative Algebra" ,
85- "math.AG" : "Algebraic Geometry" ,
86- "math.AP" : "Analysis of PDEs" ,
87- "math.AT" : "Algebraic Topology" ,
88- "math.CA" : "Classical Analysis and ODEs" ,
89- "math.CO" : "Combinatorics" ,
90- "math.CT" : "Category Theory" ,
91- "math.CV" : "Complex Variables" ,
92- "math.DG" : "Differential Geometry" ,
93- "math.DS" : "Dynamical Systems" ,
94- "math.FA" : "Functional Analysis" ,
95- "math.GM" : "General Mathematics" ,
96- "math.GN" : "General Topology" ,
97- "math.GR" : "Group Theory" ,
98- "math.GT" : "Geometric Topology" ,
99- "math.HO" : "History and Overview" ,
100- "math.IT" : "Information Theory" ,
101- "math.KT" : "K-Theory and Homology" ,
102- "math.LO" : "Logic" ,
103- "math.MG" : "Metric Geometry" ,
104- "math.MP" : "Mathematical Physics" ,
105- "math.NA" : "Numerical Analysis" ,
106- "math.NT" : "Number Theory" ,
107- "math.OA" : "Operator Algebras" ,
108- "math.OC" : "Optimization and Control" ,
109- "math.PR" : "Probability" ,
110- "math.QA" : "Quantum Algebra" ,
111- "math.RA" : "Rings and Algebras" ,
112- "math.RT" : "Representation Theory" ,
113- "math.SG" : "Symplectic Geometry" ,
114- "math.SP" : "Spectral Theory" ,
115- "math.ST" : "Statistics Theory" ,
116- # Physics
117- "physics.acc-ph" : "Accelerator Physics" ,
118- "physics.ao-ph" : "Atmospheric and Oceanic Physics" ,
119- "physics.app-ph" : "Applied Physics" ,
120- "physics.atm-clus" : "Atomic and Molecular Clusters" ,
121- "physics.atom-ph" : "Atomic Physics" ,
122- "physics.bio-ph" : "Biological Physics" ,
123- "physics.chem-ph" : "Chemical Physics" ,
124- "physics.class-ph" : "Classical Physics" ,
125- "physics.comp-ph" : "Computational Physics" ,
126- "physics.data-an" : "Data Analysis, Statistics and Probability" ,
127- "physics.ed-ph" : "Physics Education" ,
128- "physics.flu-dyn" : "Fluid Dynamics" ,
129- "physics.gen-ph" : "General Physics" ,
130- "physics.geo-ph" : "Geophysics" ,
131- "physics.hist-ph" : "History and Philosophy of Physics" ,
132- "physics.ins-det" : "Instrumentation and Detectors" ,
133- "physics.med-ph" : "Medical Physics" ,
134- "physics.optics" : "Optics" ,
135- "physics.plasm-ph" : "Plasma Physics" ,
136- "physics.pop-ph" : "Popular Physics" ,
137- "physics.soc-ph" : "Physics and Society" ,
138- "physics.space-ph" : "Space Physics" ,
139- # Statistics
140- "stat.AP" : "Applications" ,
141- "stat.CO" : "Computation" ,
142- "stat.ME" : "Methodology" ,
143- "stat.ML" : "Machine Learning" ,
144- "stat.OT" : "Other Statistics" ,
145- "stat.TH" : "Statistics Theory" ,
146- # Quantitative Biology
147- "q-bio.BM" : "Biomolecules" ,
148- "q-bio.CB" : "Cell Behavior" ,
149- "q-bio.GN" : "Genomics" ,
150- "q-bio.MN" : "Molecular Networks" ,
151- "q-bio.NC" : "Neurons and Cognition" ,
152- "q-bio.OT" : "Other Quantitative Biology" ,
153- "q-bio.PE" : "Populations and Evolution" ,
154- "q-bio.QM" : "Quantitative Methods" ,
155- "q-bio.SC" : "Subcellular Processes" ,
156- "q-bio.TO" : "Tissues and Organs" ,
157- # Economics
158- "econ.EM" : "Econometrics" ,
159- "econ.GN" : "General Economics" ,
160- "econ.TH" : "Theoretical Economics" ,
161- # Electrical Engineering
162- "eess.AS" : "Audio and Speech Processing" ,
163- "eess.IV" : "Image and Video Processing" ,
164- "eess.SP" : "Signal Processing" ,
165- "eess.SY" : "Systems and Control" ,
166- # High Energy Physics
167- "hep-ex" : "High Energy Physics - Experiment" ,
168- "hep-lat" : "High Energy Physics - Lattice" ,
169- "hep-ph" : "High Energy Physics - Phenomenology" ,
170- "hep-th" : "High Energy Physics - Theory" ,
171- # Other Physics
172- "astro-ph" : "Astrophysics" ,
173- "astro-ph.CO" : "Cosmology and Nongalactic Astrophysics" ,
174- "astro-ph.EP" : "Earth and Planetary Astrophysics" ,
175- "astro-ph.GA" : "Astrophysics of Galaxies" ,
176- "astro-ph.HE" : "High Energy Astrophysical Phenomena" ,
177- "astro-ph.IM" : "Instrumentation and Methods for Astrophysics" ,
178- "astro-ph.SR" : "Solar and Stellar Astrophysics" ,
179- "cond-mat.dis-nn" : "Disordered Systems and Neural Networks" ,
180- "cond-mat.mes-hall" : "Mesoscale and Nanoscale Physics" ,
181- "cond-mat.mtrl-sci" : "Materials Science" ,
182- "cond-mat.other" : "Other Condensed Matter" ,
183- "cond-mat.quant-gas" : "Quantum Gases" ,
184- "cond-mat.soft" : "Soft Condensed Matter" ,
185- "cond-mat.stat-mech" : "Statistical Mechanics" ,
186- "cond-mat.str-el" : "Strongly Correlated Electrons" ,
187- "cond-mat.supr-con" : "Superconductivity" ,
188- "gr-qc" : "General Relativity and Quantum Cosmology" ,
189- "nlin.AO" : "Adaptation and Self-Organizing Systems" ,
190- "nlin.CD" : "Chaotic Dynamics" ,
191- "nlin.CG" : "Cellular Automata and Lattice Gases" ,
192- "nlin.PS" : "Pattern Formation and Solitons" ,
193- "nlin.SI" : "Exactly Solvable and Integrable Systems" ,
194- "nucl-ex" : "Nuclear Experiment" ,
195- "nucl-th" : "Nuclear Theory" ,
196- "quant-ph" : "Quantum Physics" ,
197- }
198- DEFAULT_FETCH_LIMIT = 1000
39+ DEFAULT_FETCH_LIMIT = 4500 # Fetch 3 batches of 1,500 articles each
19940DEFAULT_YEARS_BACK = 5
20041# CSV file paths
20142FILE_ARXIV_AUTHOR_BUCKET = shared .path_join (
@@ -334,6 +175,45 @@ def get_license_mapping():
334175 )
335176
336177
def query_category_mapping(args, session):
    """
    Query to establish mapping of category codes and names.

    Sends an OAI-PMH ``ListSets`` request to BASE_URL and rebuilds the
    module-level CATEGORY_MAPPING dictionary (category code => category
    name), sorted by category code.

    Also see https://arxiv.org/category_taxonomy

    Args:
        args: parsed command-line arguments (currently unused; accepted so
            the signature matches query_arxiv).
        session: HTTP session object providing ``get()``.

    Raises:
        shared.QuantifyingException: if the request fails or the server
            responds with an HTTP error status.
    """
    global CATEGORY_MAPPING

    namespace = "{http://www.openarchives.org/OAI/2.0/}"
    params = {"verb": "ListSets"}
    try:
        response = session.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.HTTPError as e:
        raise shared.QuantifyingException(f"HTTP Error: {e}", 1) from e
    except requests.RequestException as e:
        raise shared.QuantifyingException(f"Request Exception: {e}", 1) from e

    root = etree.fromstring(response.content)
    # Build into a local dictionary and publish it once at the end so that a
    # failure while parsing never leaves a half-populated global behind
    mapping = {}
    for set_ in root.findall(f".//{namespace}set"):
        # Look the children up by tag instead of unpacking them by position:
        # a <set> may also contain an optional <setDescription> element (or
        # XML comments), which would break a two-item unpacking
        spec = (set_.findtext(f"{namespace}setSpec") or "").strip()
        if not spec:
            # Without a setSpec there is no category code to use as a key
            continue
        name = (set_.findtext(f"{namespace}setName") or "").strip()

        # Ensure category code (key) matches code used in articles
        spec_list = spec.split(":")
        if len(spec_list) > 1:
            # Remove parent category and replace colon with period
            # 3 part examples:
            #   math:math:AC        => math.AC
            #   physics:astro-ph:CO => astro-ph.CO
            # 2 part examples:
            #   physics:astro-ph    => astro-ph
            #   physics:quant-ph    => quant-ph
            spec_text = ".".join(spec_list[1:])
        else:
            spec_text = spec
        # Fall back to the code itself when the set has no usable name; this
        # mirrors the CATEGORY_MAPPING.get(code, code) lookup in write_data
        mapping[spec_text] = name or spec_text

    CATEGORY_MAPPING = dict(sorted(mapping.items()))
215+
216+
337217def extract_record_license (record ):
338218 """
339219 Extract CC license information from OAI-PMH XML record.
@@ -457,16 +337,20 @@ def query_arxiv(args, session):
457337 # resumption token)
458338 proceed = True
459339 while proceed :
340+ if args .limit > 0 and args .limit <= total_fetched :
341+ proceed = False
342+ break
343+
460344 if resumption_token :
461345 # Continue with resumption token
462- query_params = {
346+ params = {
463347 "verb" : "ListRecords" ,
464348 "resumptionToken" : resumption_token ,
465349 }
466350 verb = "resuming"
467351 else :
468352 # Initial request with date range
469- query_params = {
353+ params = {
470354 "verb" : "ListRecords" ,
471355 "metadataPrefix" : "arXiv" ,
472356 "from" : args .from_date ,
@@ -481,7 +365,7 @@ def query_arxiv(args, session):
481365
482366 try :
483367 # Build OAI-PMH request URL
484- response = session .get (BASE_URL , params = query_params , timeout = 60 )
368+ response = session .get (BASE_URL , params = params , timeout = 60 )
485369 response .raise_for_status ()
486370 except requests .HTTPError as e :
487371 raise shared .QuantifyingException (f"HTTP Error: { e } " , 1 )
@@ -614,7 +498,7 @@ def write_data(args, data):
614498 rows = []
615499 for license_name , categories in data ["category_counts" ].items ():
616500 for code , count in categories .items ():
617- label = CATEGORIES .get (code , code )
501+ label = CATEGORY_MAPPING .get (code , code )
618502 rows .append (
619503 {
620504 "TOOL_IDENTIFIER" : license_name ,
@@ -687,6 +571,7 @@ def main():
687571 initialize_all_data_files (args )
688572 get_license_mapping ()
689573 session = shared .get_session ()
574+ query_category_mapping (args , session )
690575 data , cc_articles_found = query_arxiv (args , session )
691576 write_data (args , data )
692577 write_provence (args , cc_articles_found )
0 commit comments