Skip to content

Commit 9993111

Browse files
committed
Revert shared.py to pre-category state, make arxiv_fetch.py fully self-contained
1 parent 37214af commit 9993111

2 files changed

Lines changed: 2 additions & 220 deletions

File tree

scripts/1-fetch/arxiv_fetch.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -472,7 +472,7 @@ def save_count_data(
472472
for lic, cats in category_counts.items():
473473
total_for_license = sum(cats.values()) or 1
474474
for code, c in cats.items():
475-
label = shared.normalize_arxiv_category(code, CATEGORIES)
475+
label = CATEGORIES.get(code, code)
476476
pct = round((c / total_for_license) * 100, 2)
477477
writer.writerow(
478478
{
@@ -510,7 +510,7 @@ def save_count_data(
510510
others = sorted_cats[TOP_N:]
511511
other_count = sum(c for _, c in others)
512512
for code, c in top:
513-
label = shared.normalize_arxiv_category(code, CATEGORIES)
513+
label = CATEGORIES.get(code, code)
514514
writer.writerow(
515515
{
516516
"TOOL_IDENTIFIER": lic,

scripts/shared.py

Lines changed: 0 additions & 218 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44
from datetime import datetime, timezone
55

66
# Third-party
7-
import yaml
87
from git import InvalidGitRepositoryError, NoSuchPathError, Repo
98
from pandas import PeriodIndex
109

@@ -162,223 +161,6 @@ def setup(current_file):
162161
return logger, paths
163162

164163

165-
def get_arxiv_categories():
    """Return the built-in ArXiv category taxonomy.

    The mapping goes from an ArXiv category code (e.g. ``"cs.AI"``) to its
    human-readable label (e.g. ``"Artificial Intelligence"``).

    A new dict is built on every call, so callers may mutate the result
    without affecting later calls.

    Returns:
        dict: category code (str) -> human-readable label (str).
    """
    return {
        # Computer Science
        "cs.AI": "Artificial Intelligence",
        "cs.AR": "Hardware Architecture",
        "cs.CC": "Computational Complexity",
        "cs.CE": "Computational Engineering, Finance, and Science",
        "cs.CG": "Computational Geometry",
        "cs.CL": "Computation and Language",
        "cs.CR": "Cryptography and Security",
        "cs.CV": "Computer Vision and Pattern Recognition",
        "cs.CY": "Computers and Society",
        "cs.DB": "Databases",
        "cs.DC": "Distributed, Parallel, and Cluster Computing",
        "cs.DL": "Digital Libraries",
        "cs.DM": "Discrete Mathematics",
        "cs.DS": "Data Structures and Algorithms",
        "cs.ET": "Emerging Technologies",
        "cs.FL": "Formal Languages and Automata Theory",
        "cs.GL": "General Literature",
        "cs.GR": "Graphics",
        "cs.GT": "Computer Science and Game Theory",
        "cs.HC": "Human-Computer Interaction",
        "cs.IR": "Information Retrieval",
        "cs.IT": "Information Theory",
        "cs.LG": "Machine Learning",
        "cs.LO": "Logic in Computer Science",
        "cs.MA": "Multiagent Systems",
        "cs.MM": "Multimedia",
        "cs.MS": "Mathematical Software",
        "cs.NA": "Numerical Analysis",
        "cs.NE": "Neural and Evolutionary Computing",
        "cs.NI": "Networking and Internet Architecture",
        "cs.OH": "Other Computer Science",
        "cs.OS": "Operating Systems",
        "cs.PF": "Performance",
        "cs.PL": "Programming Languages",
        "cs.RO": "Robotics",
        "cs.SC": "Symbolic Computation",
        "cs.SD": "Sound",
        "cs.SE": "Software Engineering",
        "cs.SI": "Social and Information Networks",
        "cs.SY": "Systems and Control",
        # Mathematics
        "math.AC": "Commutative Algebra",
        "math.AG": "Algebraic Geometry",
        "math.AP": "Analysis of PDEs",
        "math.AT": "Algebraic Topology",
        "math.CA": "Classical Analysis and ODEs",
        "math.CO": "Combinatorics",
        "math.CT": "Category Theory",
        "math.CV": "Complex Variables",
        "math.DG": "Differential Geometry",
        "math.DS": "Dynamical Systems",
        "math.FA": "Functional Analysis",
        "math.GM": "General Mathematics",
        "math.GN": "General Topology",
        "math.GR": "Group Theory",
        "math.GT": "Geometric Topology",
        "math.HO": "History and Overview",
        "math.IT": "Information Theory",
        "math.KT": "K-Theory and Homology",
        "math.LO": "Logic",
        "math.MG": "Metric Geometry",
        "math.MP": "Mathematical Physics",
        "math.NA": "Numerical Analysis",
        "math.NT": "Number Theory",
        "math.OA": "Operator Algebras",
        "math.OC": "Optimization and Control",
        "math.PR": "Probability",
        "math.QA": "Quantum Algebra",
        "math.RA": "Rings and Algebras",
        "math.RT": "Representation Theory",
        "math.SG": "Symplectic Geometry",
        "math.SP": "Spectral Theory",
        "math.ST": "Statistics Theory",
        # Physics
        "physics.acc-ph": "Accelerator Physics",
        "physics.ao-ph": "Atmospheric and Oceanic Physics",
        "physics.app-ph": "Applied Physics",
        "physics.atm-clus": "Atomic and Molecular Clusters",
        "physics.atom-ph": "Atomic Physics",
        "physics.bio-ph": "Biological Physics",
        "physics.chem-ph": "Chemical Physics",
        "physics.class-ph": "Classical Physics",
        "physics.comp-ph": "Computational Physics",
        "physics.data-an": "Data Analysis, Statistics and Probability",
        "physics.ed-ph": "Physics Education",
        "physics.flu-dyn": "Fluid Dynamics",
        "physics.gen-ph": "General Physics",
        "physics.geo-ph": "Geophysics",
        "physics.hist-ph": "History and Philosophy of Physics",
        "physics.ins-det": "Instrumentation and Detectors",
        "physics.med-ph": "Medical Physics",
        "physics.optics": "Optics",
        "physics.plasm-ph": "Plasma Physics",
        "physics.pop-ph": "Popular Physics",
        "physics.soc-ph": "Physics and Society",
        "physics.space-ph": "Space Physics",
        # Statistics
        "stat.AP": "Applications",
        "stat.CO": "Computation",
        "stat.ME": "Methodology",
        "stat.ML": "Machine Learning",
        "stat.OT": "Other Statistics",
        "stat.TH": "Statistics Theory",
        # Quantitative Biology
        "q-bio.BM": "Biomolecules",
        "q-bio.CB": "Cell Behavior",
        "q-bio.GN": "Genomics",
        "q-bio.MN": "Molecular Networks",
        "q-bio.NC": "Neurons and Cognition",
        "q-bio.OT": "Other Quantitative Biology",
        "q-bio.PE": "Populations and Evolution",
        "q-bio.QM": "Quantitative Methods",
        "q-bio.SC": "Subcellular Processes",
        "q-bio.TO": "Tissues and Organs",
        # Quantitative Finance (archive was absent from the original table)
        "q-fin.CP": "Computational Finance",
        "q-fin.EC": "Economics",
        "q-fin.GN": "General Finance",
        "q-fin.MF": "Mathematical Finance",
        "q-fin.PM": "Portfolio Management",
        "q-fin.PR": "Pricing of Securities",
        "q-fin.RM": "Risk Management",
        "q-fin.ST": "Statistical Finance",
        "q-fin.TR": "Trading and Market Microstructure",
        # Economics
        "econ.EM": "Econometrics",
        "econ.GN": "General Economics",
        "econ.TH": "Theoretical Economics",
        # Electrical Engineering
        "eess.AS": "Audio and Speech Processing",
        "eess.IV": "Image and Video Processing",
        "eess.SP": "Signal Processing",
        "eess.SY": "Systems and Control",
        # High Energy Physics
        "hep-ex": "High Energy Physics - Experiment",
        "hep-lat": "High Energy Physics - Lattice",
        "hep-ph": "High Energy Physics - Phenomenology",
        "hep-th": "High Energy Physics - Theory",
        # Other Physics
        "astro-ph": "Astrophysics",
        "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
        "astro-ph.EP": "Earth and Planetary Astrophysics",
        "astro-ph.GA": "Astrophysics of Galaxies",
        "astro-ph.HE": "High Energy Astrophysical Phenomena",
        "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
        "astro-ph.SR": "Solar and Stellar Astrophysics",
        "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
        "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
        "cond-mat.mtrl-sci": "Materials Science",
        "cond-mat.other": "Other Condensed Matter",
        "cond-mat.quant-gas": "Quantum Gases",
        "cond-mat.soft": "Soft Condensed Matter",
        "cond-mat.stat-mech": "Statistical Mechanics",
        "cond-mat.str-el": "Strongly Correlated Electrons",
        "cond-mat.supr-con": "Superconductivity",
        "gr-qc": "General Relativity and Quantum Cosmology",
        # Archive-level code for Mathematical Physics (no ".XX" suffix, so
        # the uppercase-prefix fallback would otherwise leave it as a raw
        # code); "math.MP" above is the math-side alias.
        "math-ph": "Mathematical Physics",
        "nlin.AO": "Adaptation and Self-Organizing Systems",
        "nlin.CD": "Chaotic Dynamics",
        "nlin.CG": "Cellular Automata and Lattice Gases",
        "nlin.PS": "Pattern Formation and Solitons",
        "nlin.SI": "Exactly Solvable and Integrable Systems",
        "nucl-ex": "Nuclear Experiment",
        "nucl-th": "Nuclear Theory",
        "quant-ph": "Quantum Physics",
    }
327-
328-
329-
def load_arxiv_categories(data_dir=None):
    """Load the ArXiv category code -> label mapping.

    Lookup order:

    1. If ``data_dir`` is given and ``<data_dir>/arxiv_category_map.yaml``
       exists, load the mapping from that file.
    2. If that yields nothing usable (no ``data_dir``, missing file, read
       or parse error, empty file, or top-level YAML value that is not a
       mapping), fall back to the built-in table from
       ``get_arxiv_categories()``. When ``data_dir`` is given, the table
       is then written to the YAML file (best effort) so later runs can
       load it from disk.

    Failures to read or write the YAML file are logged as warnings and
    never raised.

    Args:
        data_dir: Directory holding ``arxiv_category_map.yaml``. When
            falsy, the file is neither read nor written.

    Returns:
        dict: category code -> human-readable label.
    """
    categories = {}
    yaml_path = (
        os.path.join(data_dir, "arxiv_category_map.yaml")
        if data_dir
        else None
    )

    # Try loading from YAML file first
    if yaml_path and os.path.exists(yaml_path):
        try:
            with open(yaml_path, "r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f) or {}
        except (yaml.YAMLError, OSError) as e:
            logging.warning(f"Failed to load category YAML: {e}")
        else:
            # safe_load() returns whatever the document's top-level value
            # is; a list or scalar would break the dict lookups callers
            # perform, so only accept a mapping.
            if isinstance(loaded, dict):
                categories = loaded
                logging.info(
                    f"Loaded {len(categories)} categories from {yaml_path}"
                )
            else:
                logging.warning(
                    "Failed to load category YAML: expected a mapping in"
                    f" {yaml_path}, got {type(loaded).__name__}"
                )

    # Fallback to comprehensive mapping if no local categories
    if not categories:
        categories = get_arxiv_categories()
        if categories and yaml_path:
            # Persist the built-in mapping for future use
            try:
                os.makedirs(data_dir, exist_ok=True)
                with open(yaml_path, "w", encoding="utf-8") as f:
                    yaml.dump(
                        categories, f, default_flow_style=False, sort_keys=True
                    )
                logging.info(
                    f"Saved {len(categories)} categories to {yaml_path}"
                )
            except (yaml.YAMLError, OSError) as e:
                logging.warning(f"Failed to save categories: {e}")

    return categories
365-
366-
367-
def normalize_arxiv_category(code, categories=None):
    """Translate an ArXiv category code into a display label.

    Args:
        code: Category code such as ``"cs.AI"``. Falsy values and the
            literal ``"Unknown"`` are returned unchanged.
        categories: Optional mapping of category code -> label.

    Returns:
        The label from ``categories`` when ``code`` is a key of it.
        Otherwise, for a dotted code, the part before the first dot in
        upper case (``"cs.XX"`` -> ``"CS"``); for any other code, ``code``
        itself.
    """
    # Nothing to translate for empty values or the "Unknown" placeholder.
    if not code or code == "Unknown":
        return code

    # An explicit mapping entry always wins.
    if categories and code in categories:
        return categories[code]

    # No mapping entry: fall back to the upper-cased archive prefix when
    # the code is dotted, otherwise leave the code as it is.
    archive, dot, _ = code.partition(".")
    return archive.upper() if dot else code
380-
381-
382164
def update_readme(
383165
args,
384166
section_title,

0 commit comments

Comments
 (0)