Skip to content

Commit 076b95a

Browse files
committed
feat: centralize ArXiv category management in shared.py
1 parent 7defab5 commit 076b95a

1 file changed

Lines changed: 218 additions & 0 deletions

File tree

scripts/shared.py

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from datetime import datetime, timezone
55

66
# Third-party
7+
import yaml
78
from git import InvalidGitRepositoryError, NoSuchPathError, Repo
89
from pandas import PeriodIndex
910

@@ -161,6 +162,223 @@ def setup(current_file):
161162
return logger, paths
162163

163164

165+
def get_arxiv_categories():
    """Return the built-in ArXiv category taxonomy.

    Returns:
        dict: A freshly built mapping from ArXiv category code (for example
        ``"cs.AI"`` or ``"hep-th"``) to its human-readable label. A new dict
        is created on every call, so callers may mutate the result freely.
    """
    categories = {
        # Computer Science
        "cs.AI": "Artificial Intelligence",
        "cs.AR": "Hardware Architecture",
        "cs.CC": "Computational Complexity",
        "cs.CE": "Computational Engineering, Finance, and Science",
        "cs.CG": "Computational Geometry",
        "cs.CL": "Computation and Language",
        "cs.CR": "Cryptography and Security",
        "cs.CV": "Computer Vision and Pattern Recognition",
        "cs.CY": "Computers and Society",
        "cs.DB": "Databases",
        "cs.DC": "Distributed, Parallel, and Cluster Computing",
        "cs.DL": "Digital Libraries",
        "cs.DM": "Discrete Mathematics",
        "cs.DS": "Data Structures and Algorithms",
        "cs.ET": "Emerging Technologies",
        "cs.FL": "Formal Languages and Automata Theory",
        "cs.GL": "General Literature",
        "cs.GR": "Graphics",
        "cs.GT": "Computer Science and Game Theory",
        "cs.HC": "Human-Computer Interaction",
        "cs.IR": "Information Retrieval",
        "cs.IT": "Information Theory",
        "cs.LG": "Machine Learning",
        "cs.LO": "Logic in Computer Science",
        "cs.MA": "Multiagent Systems",
        "cs.MM": "Multimedia",
        "cs.MS": "Mathematical Software",
        "cs.NA": "Numerical Analysis",
        "cs.NE": "Neural and Evolutionary Computing",
        "cs.NI": "Networking and Internet Architecture",
        "cs.OH": "Other Computer Science",
        "cs.OS": "Operating Systems",
        "cs.PF": "Performance",
        "cs.PL": "Programming Languages",
        "cs.RO": "Robotics",
        "cs.SC": "Symbolic Computation",
        "cs.SD": "Sound",
        "cs.SE": "Software Engineering",
        "cs.SI": "Social and Information Networks",
        "cs.SY": "Systems and Control",
        # Mathematics
        "math.AC": "Commutative Algebra",
        "math.AG": "Algebraic Geometry",
        "math.AP": "Analysis of PDEs",
        "math.AT": "Algebraic Topology",
        "math.CA": "Classical Analysis and ODEs",
        "math.CO": "Combinatorics",
        "math.CT": "Category Theory",
        "math.CV": "Complex Variables",
        "math.DG": "Differential Geometry",
        "math.DS": "Dynamical Systems",
        "math.FA": "Functional Analysis",
        "math.GM": "General Mathematics",
        "math.GN": "General Topology",
        "math.GR": "Group Theory",
        "math.GT": "Geometric Topology",
        "math.HO": "History and Overview",
        "math.IT": "Information Theory",
        "math.KT": "K-Theory and Homology",
        "math.LO": "Logic",
        "math.MG": "Metric Geometry",
        "math.MP": "Mathematical Physics",
        "math.NA": "Numerical Analysis",
        "math.NT": "Number Theory",
        "math.OA": "Operator Algebras",
        "math.OC": "Optimization and Control",
        "math.PR": "Probability",
        "math.QA": "Quantum Algebra",
        "math.RA": "Rings and Algebras",
        "math.RT": "Representation Theory",
        "math.SG": "Symplectic Geometry",
        "math.SP": "Spectral Theory",
        "math.ST": "Statistics Theory",
        # Physics
        "physics.acc-ph": "Accelerator Physics",
        "physics.ao-ph": "Atmospheric and Oceanic Physics",
        "physics.app-ph": "Applied Physics",
        "physics.atm-clus": "Atomic and Molecular Clusters",
        "physics.atom-ph": "Atomic Physics",
        "physics.bio-ph": "Biological Physics",
        "physics.chem-ph": "Chemical Physics",
        "physics.class-ph": "Classical Physics",
        "physics.comp-ph": "Computational Physics",
        "physics.data-an": "Data Analysis, Statistics and Probability",
        "physics.ed-ph": "Physics Education",
        "physics.flu-dyn": "Fluid Dynamics",
        "physics.gen-ph": "General Physics",
        "physics.geo-ph": "Geophysics",
        "physics.hist-ph": "History and Philosophy of Physics",
        "physics.ins-det": "Instrumentation and Detectors",
        "physics.med-ph": "Medical Physics",
        "physics.optics": "Optics",
        "physics.plasm-ph": "Plasma Physics",
        "physics.pop-ph": "Popular Physics",
        "physics.soc-ph": "Physics and Society",
        "physics.space-ph": "Space Physics",
        # Statistics
        "stat.AP": "Applications",
        "stat.CO": "Computation",
        "stat.ME": "Methodology",
        "stat.ML": "Machine Learning",
        "stat.OT": "Other Statistics",
        "stat.TH": "Statistics Theory",
        # Quantitative Biology
        "q-bio.BM": "Biomolecules",
        "q-bio.CB": "Cell Behavior",
        "q-bio.GN": "Genomics",
        "q-bio.MN": "Molecular Networks",
        "q-bio.NC": "Neurons and Cognition",
        "q-bio.OT": "Other Quantitative Biology",
        "q-bio.PE": "Populations and Evolution",
        "q-bio.QM": "Quantitative Methods",
        "q-bio.SC": "Subcellular Processes",
        "q-bio.TO": "Tissues and Organs",
        # Quantitative Finance
        "q-fin.CP": "Computational Finance",
        "q-fin.EC": "Economics",
        "q-fin.GN": "General Finance",
        "q-fin.MF": "Mathematical Finance",
        "q-fin.PM": "Portfolio Management",
        "q-fin.PR": "Pricing of Securities",
        "q-fin.RM": "Risk Management",
        "q-fin.ST": "Statistical Finance",
        "q-fin.TR": "Trading and Market Microstructure",
        # Economics
        "econ.EM": "Econometrics",
        "econ.GN": "General Economics",
        "econ.TH": "Theoretical Economics",
        # Electrical Engineering
        "eess.AS": "Audio and Speech Processing",
        "eess.IV": "Image and Video Processing",
        "eess.SP": "Signal Processing",
        "eess.SY": "Systems and Control",
        # High Energy Physics
        "hep-ex": "High Energy Physics - Experiment",
        "hep-lat": "High Energy Physics - Lattice",
        "hep-ph": "High Energy Physics - Phenomenology",
        "hep-th": "High Energy Physics - Theory",
        # Other Physics
        "astro-ph": "Astrophysics",
        "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
        "astro-ph.EP": "Earth and Planetary Astrophysics",
        "astro-ph.GA": "Astrophysics of Galaxies",
        "astro-ph.HE": "High Energy Astrophysical Phenomena",
        "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
        "astro-ph.SR": "Solar and Stellar Astrophysics",
        "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
        "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
        "cond-mat.mtrl-sci": "Materials Science",
        "cond-mat.other": "Other Condensed Matter",
        "cond-mat.quant-gas": "Quantum Gases",
        "cond-mat.soft": "Soft Condensed Matter",
        "cond-mat.stat-mech": "Statistical Mechanics",
        "cond-mat.str-el": "Strongly Correlated Electrons",
        "cond-mat.supr-con": "Superconductivity",
        "gr-qc": "General Relativity and Quantum Cosmology",
        # Archive-level code for Mathematical Physics (math.MP is its alias)
        "math-ph": "Mathematical Physics",
        "nlin.AO": "Adaptation and Self-Organizing Systems",
        "nlin.CD": "Chaotic Dynamics",
        "nlin.CG": "Cellular Automata and Lattice Gases",
        "nlin.PS": "Pattern Formation and Solitons",
        "nlin.SI": "Exactly Solvable and Integrable Systems",
        "nucl-ex": "Nuclear Experiment",
        "nucl-th": "Nuclear Theory",
        "quant-ph": "Quantum Physics",
    }

    return categories
327+
328+
329+
def load_arxiv_categories(data_dir=None):
    """Load ArXiv category mappings with fallback to the built-in mapping.

    When ``data_dir`` is given, ``<data_dir>/arxiv_category_map.yaml`` is
    tried first. If that file is missing, unreadable, empty, or does not
    contain a mapping, the table from ``get_arxiv_categories()`` is used
    instead and written back to that path (best effort) for future runs.

    Args:
        data_dir: Directory holding the cached YAML file, or ``None`` to skip
            the cache entirely.

    Returns:
        dict: Category code -> human-readable label.
    """
    yaml_path = (
        os.path.join(data_dir, "arxiv_category_map.yaml") if data_dir else None
    )
    categories = {}

    # Try loading from YAML file first
    if yaml_path and os.path.exists(yaml_path):
        try:
            with open(yaml_path, "r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f)
        except (yaml.YAMLError, OSError, UnicodeDecodeError) as e:
            # UnicodeDecodeError is a ValueError, not an OSError, so it has
            # to be listed explicitly for a corrupt cache file to be survivable.
            logging.warning(f"Failed to load category YAML: {e}")
        else:
            if isinstance(loaded, dict):
                categories = loaded
                logging.info(
                    f"Loaded {len(categories)} categories from {yaml_path}"
                )
            elif loaded is not None:
                # A list/scalar document would break code -> label lookups
                # downstream, so treat it like a missing cache.
                logging.warning(
                    f"Ignoring {yaml_path}: expected a mapping, "
                    f"got {type(loaded).__name__}"
                )

    if categories:
        return categories

    # Fallback to comprehensive mapping if no local categories
    categories = get_arxiv_categories()
    if categories and yaml_path:
        # Save the built-in categories for future use; failure is non-fatal
        try:
            os.makedirs(data_dir, exist_ok=True)
            with open(yaml_path, "w", encoding="utf-8") as f:
                yaml.dump(categories, f, default_flow_style=False, sort_keys=True)
            logging.info(f"Saved {len(categories)} categories to {yaml_path}")
        except (yaml.YAMLError, OSError) as e:
            logging.warning(f"Failed to save categories: {e}")

    return categories
365+
366+
367+
def normalize_arxiv_category(code, categories=None):
    """Convert a category code to a human-readable label.

    Args:
        code: ArXiv category code such as ``"cs.AI"``. Non-string values
            (``None``, NaN, numbers) and blank strings are returned unchanged.
        categories: Optional mapping of category code -> label.

    Returns:
        The label from ``categories`` when the code is present there;
        otherwise the upper-cased archive prefix for dotted codes
        (``"cs.XX"`` -> ``"CS"``); otherwise the whitespace-stripped code.
    """
    # Missing values (e.g. NaN) are truthy but are not strings, so substring
    # tests on them would raise TypeError; pass them through untouched.
    if not isinstance(code, str):
        return code

    key = code.strip()
    if not key:
        return code
    if key == "Unknown":
        return key

    if categories and key in categories:
        return categories[key]

    # Fallback: use uppercase first part of code
    archive, dot, _ = key.partition(".")
    if dot:
        return archive.upper()

    return key
380+
381+
164382
def update_readme(
165383
args,
166384
section_title,

0 commit comments

Comments
 (0)