Skip to content

Commit b2f96f9

Browse files
committed
Refactor arxiv_fetch.py: define CATEGORIES constant locally, reorganize constants, remove redundancies
1 parent 2ef6c6f commit b2f96f9

1 file changed

Lines changed: 184 additions & 28 deletions

File tree

scripts/1-fetch/arxiv_fetch.py

Lines changed: 184 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,10 @@
4141
MAX_RESULTS_PER_QUERY = 500 # Maximum results to fetch per search query
4242
DEFAULT_FETCH_LIMIT = 800 # Default total papers to fetch
4343

44-
# HTTP Retry Configuration (using shared constants where available)
44+
# HTTP Retry Configuration
4545
RETRY_TOTAL = 5
4646
RETRY_BACKOFF_FACTOR = 1
4747

48-
4948
# Search Queries
5049
SEARCH_QUERIES = [
5150
'all:"creative commons"',
@@ -66,23 +65,7 @@
6665
'all:"CC-0"',
6766
]
6867

69-
# File Paths
70-
FILE_ARXIV_COUNT = shared.path_join(PATHS["data_1-fetch"], "arxiv_1_count.csv")
71-
FILE_ARXIV_CATEGORY_REPORT = shared.path_join(
72-
PATHS["data_1-fetch"], "arxiv_2_count_by_category_report.csv"
73-
)
74-
FILE_ARXIV_CATEGORY_REPORT_AGGREGATE = shared.path_join(
75-
PATHS["data_1-fetch"], "arxiv_2_count_by_category_report_agg.csv"
76-
)
77-
FILE_ARXIV_YEAR = shared.path_join(
78-
PATHS["data_1-fetch"], "arxiv_3_count_by_year.csv"
79-
)
80-
FILE_ARXIV_AUTHOR_BUCKET = shared.path_join(
81-
PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv"
82-
)
83-
# records metadata for each run for audit, reproducibility, and provenance
84-
FILE_PROVENANCE = shared.path_join(PATHS["data"], "arxiv_provenance.yaml")
85-
68+
# CSV Headers
8669
HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
8770
HEADER_CATEGORY_REPORT = [
8871
"TOOL_IDENTIFIER",
@@ -94,10 +77,6 @@
9477
HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"]
9578
HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER", "AUTHOR_BUCKET", "COUNT"]
9679

97-
QUARTER = os.path.basename(PATHS["data_quarter"])
98-
99-
CATEGORY_LABELS = {}
100-
10180
# Compiled regex patterns for CC license detection
10281
CC_PATTERNS = [
10382
(re.compile(r"\bCC[-\s]?0\b", re.IGNORECASE), "CC0"),
@@ -119,6 +98,186 @@
11998
),
12099
]
121100

101+
# ArXiv Categories - manually curated from the official ArXiv taxonomy
102+
# Source: https://arxiv.org/category_taxonomy
103+
CATEGORIES = {
104+
# Computer Science
105+
"cs.AI": "Artificial Intelligence",
106+
"cs.AR": "Hardware Architecture",
107+
"cs.CC": "Computational Complexity",
108+
"cs.CE": "Computational Engineering, Finance, and Science",
109+
"cs.CG": "Computational Geometry",
110+
"cs.CL": "Computation and Language",
111+
"cs.CR": "Cryptography and Security",
112+
"cs.CV": "Computer Vision and Pattern Recognition",
113+
"cs.CY": "Computers and Society",
114+
"cs.DB": "Databases",
115+
"cs.DC": "Distributed, Parallel, and Cluster Computing",
116+
"cs.DL": "Digital Libraries",
117+
"cs.DM": "Discrete Mathematics",
118+
"cs.DS": "Data Structures and Algorithms",
119+
"cs.ET": "Emerging Technologies",
120+
"cs.FL": "Formal Languages and Automata Theory",
121+
"cs.GL": "General Literature",
122+
"cs.GR": "Graphics",
123+
"cs.GT": "Computer Science and Game Theory",
124+
"cs.HC": "Human-Computer Interaction",
125+
"cs.IR": "Information Retrieval",
126+
"cs.IT": "Information Theory",
127+
"cs.LG": "Machine Learning",
128+
"cs.LO": "Logic in Computer Science",
129+
"cs.MA": "Multiagent Systems",
130+
"cs.MM": "Multimedia",
131+
"cs.MS": "Mathematical Software",
132+
"cs.NA": "Numerical Analysis",
133+
"cs.NE": "Neural and Evolutionary Computing",
134+
"cs.NI": "Networking and Internet Architecture",
135+
"cs.OH": "Other Computer Science",
136+
"cs.OS": "Operating Systems",
137+
"cs.PF": "Performance",
138+
"cs.PL": "Programming Languages",
139+
"cs.RO": "Robotics",
140+
"cs.SC": "Symbolic Computation",
141+
"cs.SD": "Sound",
142+
"cs.SE": "Software Engineering",
143+
"cs.SI": "Social and Information Networks",
144+
"cs.SY": "Systems and Control",
145+
# Mathematics
146+
"math.AC": "Commutative Algebra",
147+
"math.AG": "Algebraic Geometry",
148+
"math.AP": "Analysis of PDEs",
149+
"math.AT": "Algebraic Topology",
150+
"math.CA": "Classical Analysis and ODEs",
151+
"math.CO": "Combinatorics",
152+
"math.CT": "Category Theory",
153+
"math.CV": "Complex Variables",
154+
"math.DG": "Differential Geometry",
155+
"math.DS": "Dynamical Systems",
156+
"math.FA": "Functional Analysis",
157+
"math.GM": "General Mathematics",
158+
"math.GN": "General Topology",
159+
"math.GR": "Group Theory",
160+
"math.GT": "Geometric Topology",
161+
"math.HO": "History and Overview",
162+
"math.IT": "Information Theory",
163+
"math.KT": "K-Theory and Homology",
164+
"math.LO": "Logic",
165+
"math.MG": "Metric Geometry",
166+
"math.MP": "Mathematical Physics",
167+
"math.NA": "Numerical Analysis",
168+
"math.NT": "Number Theory",
169+
"math.OA": "Operator Algebras",
170+
"math.OC": "Optimization and Control",
171+
"math.PR": "Probability",
172+
"math.QA": "Quantum Algebra",
173+
"math.RA": "Rings and Algebras",
174+
"math.RT": "Representation Theory",
175+
"math.SG": "Symplectic Geometry",
176+
"math.SP": "Spectral Theory",
177+
"math.ST": "Statistics Theory",
178+
# Physics
179+
"physics.acc-ph": "Accelerator Physics",
180+
"physics.ao-ph": "Atmospheric and Oceanic Physics",
181+
"physics.app-ph": "Applied Physics",
182+
"physics.atm-clus": "Atomic and Molecular Clusters",
183+
"physics.atom-ph": "Atomic Physics",
184+
"physics.bio-ph": "Biological Physics",
185+
"physics.chem-ph": "Chemical Physics",
186+
"physics.class-ph": "Classical Physics",
187+
"physics.comp-ph": "Computational Physics",
188+
"physics.data-an": "Data Analysis, Statistics and Probability",
189+
"physics.ed-ph": "Physics Education",
190+
"physics.flu-dyn": "Fluid Dynamics",
191+
"physics.gen-ph": "General Physics",
192+
"physics.geo-ph": "Geophysics",
193+
"physics.hist-ph": "History and Philosophy of Physics",
194+
"physics.ins-det": "Instrumentation and Detectors",
195+
"physics.med-ph": "Medical Physics",
196+
"physics.optics": "Optics",
197+
"physics.plasm-ph": "Plasma Physics",
198+
"physics.pop-ph": "Popular Physics",
199+
"physics.soc-ph": "Physics and Society",
200+
"physics.space-ph": "Space Physics",
201+
# Statistics
202+
"stat.AP": "Applications",
203+
"stat.CO": "Computation",
204+
"stat.ME": "Methodology",
205+
"stat.ML": "Machine Learning",
206+
"stat.OT": "Other Statistics",
207+
"stat.TH": "Statistics Theory",
208+
# Quantitative Biology
209+
"q-bio.BM": "Biomolecules",
210+
"q-bio.CB": "Cell Behavior",
211+
"q-bio.GN": "Genomics",
212+
"q-bio.MN": "Molecular Networks",
213+
"q-bio.NC": "Neurons and Cognition",
214+
"q-bio.OT": "Other Quantitative Biology",
215+
"q-bio.PE": "Populations and Evolution",
216+
"q-bio.QM": "Quantitative Methods",
217+
"q-bio.SC": "Subcellular Processes",
218+
"q-bio.TO": "Tissues and Organs",
219+
# Economics
220+
"econ.EM": "Econometrics",
221+
"econ.GN": "General Economics",
222+
"econ.TH": "Theoretical Economics",
223+
# Electrical Engineering and Systems Science
224+
"eess.AS": "Audio and Speech Processing",
225+
"eess.IV": "Image and Video Processing",
226+
"eess.SP": "Signal Processing",
227+
"eess.SY": "Systems and Control",
228+
# High Energy Physics
229+
"hep-ex": "High Energy Physics - Experiment",
230+
"hep-lat": "High Energy Physics - Lattice",
231+
"hep-ph": "High Energy Physics - Phenomenology",
232+
"hep-th": "High Energy Physics - Theory",
233+
# Other Physics
234+
"astro-ph": "Astrophysics",
235+
"astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
236+
"astro-ph.EP": "Earth and Planetary Astrophysics",
237+
"astro-ph.GA": "Astrophysics of Galaxies",
238+
"astro-ph.HE": "High Energy Astrophysical Phenomena",
239+
"astro-ph.IM": "Instrumentation and Methods for Astrophysics",
240+
"astro-ph.SR": "Solar and Stellar Astrophysics",
241+
"cond-mat.dis-nn": "Disordered Systems and Neural Networks",
242+
"cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
243+
"cond-mat.mtrl-sci": "Materials Science",
244+
"cond-mat.other": "Other Condensed Matter",
245+
"cond-mat.quant-gas": "Quantum Gases",
246+
"cond-mat.soft": "Soft Condensed Matter",
247+
"cond-mat.stat-mech": "Statistical Mechanics",
248+
"cond-mat.str-el": "Strongly Correlated Electrons",
249+
"cond-mat.supr-con": "Superconductivity",
250+
"gr-qc": "General Relativity and Quantum Cosmology",
251+
"nlin.AO": "Adaptation and Self-Organizing Systems",
252+
"nlin.CD": "Chaotic Dynamics",
253+
"nlin.CG": "Cellular Automata and Lattice Gases",
254+
"nlin.PS": "Pattern Formation and Solitons",
255+
"nlin.SI": "Exactly Solvable and Integrable Systems",
256+
"nucl-ex": "Nuclear Experiment",
257+
"nucl-th": "Nuclear Theory",
258+
"quant-ph": "Quantum Physics",
259+
}
260+
261+
# File Paths
262+
FILE_ARXIV_COUNT = shared.path_join(PATHS["data_1-fetch"], "arxiv_1_count.csv")
263+
FILE_ARXIV_CATEGORY_REPORT = shared.path_join(
264+
PATHS["data_1-fetch"], "arxiv_2_count_by_category_report.csv"
265+
)
266+
FILE_ARXIV_CATEGORY_REPORT_AGGREGATE = shared.path_join(
267+
PATHS["data_1-fetch"], "arxiv_2_count_by_category_report_agg.csv"
268+
)
269+
FILE_ARXIV_YEAR = shared.path_join(
270+
PATHS["data_1-fetch"], "arxiv_3_count_by_year.csv"
271+
)
272+
FILE_ARXIV_AUTHOR_BUCKET = shared.path_join(
273+
PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv"
274+
)
275+
# records metadata for each run for audit, reproducibility, and provenance
276+
FILE_PROVENANCE = shared.path_join(PATHS["data"], "arxiv_provenance.yaml")
277+
278+
# Runtime variables
279+
QUARTER = os.path.basename(PATHS["data_quarter"])
280+
122281
# Log the start of the script execution
123282
LOGGER.info("Script execution started.")
124283

@@ -313,7 +472,7 @@ def save_count_data(
313472
for lic, cats in category_counts.items():
314473
total_for_license = sum(cats.values()) or 1
315474
for code, c in cats.items():
316-
label = shared.normalize_arxiv_category(code, CATEGORY_LABELS)
475+
label = shared.normalize_arxiv_category(code, CATEGORIES)
317476
pct = round((c / total_for_license) * 100, 2)
318477
writer.writerow(
319478
{
@@ -351,7 +510,7 @@ def save_count_data(
351510
others = sorted_cats[TOP_N:]
352511
other_count = sum(c for _, c in others)
353512
for code, c in top:
354-
label = shared.normalize_arxiv_category(code, CATEGORY_LABELS)
513+
label = shared.normalize_arxiv_category(code, CATEGORIES)
355514
writer.writerow(
356515
{
357516
"TOOL_IDENTIFIER": lic,
@@ -413,9 +572,6 @@ def query_arxiv(args):
413572
LOGGER.info("Beginning to fetch results from ArXiv API")
414573
session = get_requests_session()
415574

416-
# Load category mappings using shared function
417-
CATEGORY_LABELS.update(shared.load_arxiv_categories(PATHS.get("data")))
418-
419575
results_per_iteration = RESULTS_PER_REQUEST
420576

421577
search_queries = SEARCH_QUERIES

0 commit comments

Comments
 (0)