|
41 | 41 | MAX_RESULTS_PER_QUERY = 500 # Maximum results to fetch per search query |
42 | 42 | DEFAULT_FETCH_LIMIT = 800 # Default total papers to fetch |
43 | 43 |
|
44 | | -# HTTP Retry Configuration (using shared constants where available) |
| 44 | +# HTTP Retry Configuration |
45 | 45 | RETRY_TOTAL = 5 |
46 | 46 | RETRY_BACKOFF_FACTOR = 1 |
47 | 47 |
|
48 | | - |
49 | 48 | # Search Queries |
50 | 49 | SEARCH_QUERIES = [ |
51 | 50 | 'all:"creative commons"', |
|
66 | 65 | 'all:"CC-0"', |
67 | 66 | ] |
68 | 67 |
|
69 | | -# File Paths |
70 | | -FILE_ARXIV_COUNT = shared.path_join(PATHS["data_1-fetch"], "arxiv_1_count.csv") |
71 | | -FILE_ARXIV_CATEGORY_REPORT = shared.path_join( |
72 | | - PATHS["data_1-fetch"], "arxiv_2_count_by_category_report.csv" |
73 | | -) |
74 | | -FILE_ARXIV_CATEGORY_REPORT_AGGREGATE = shared.path_join( |
75 | | - PATHS["data_1-fetch"], "arxiv_2_count_by_category_report_agg.csv" |
76 | | -) |
77 | | -FILE_ARXIV_YEAR = shared.path_join( |
78 | | - PATHS["data_1-fetch"], "arxiv_3_count_by_year.csv" |
79 | | -) |
80 | | -FILE_ARXIV_AUTHOR_BUCKET = shared.path_join( |
81 | | - PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv" |
82 | | -) |
83 | | -# records metadata for each run for audit, reproducibility, and provenance |
84 | | -FILE_PROVENANCE = shared.path_join(PATHS["data"], "arxiv_provenance.yaml") |
85 | | - |
| 68 | +# CSV Headers |
86 | 69 | HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] |
87 | 70 | HEADER_CATEGORY_REPORT = [ |
88 | 71 | "TOOL_IDENTIFIER", |
|
94 | 77 | HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] |
95 | 78 | HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER", "AUTHOR_BUCKET", "COUNT"] |
96 | 79 |
|
97 | | -QUARTER = os.path.basename(PATHS["data_quarter"]) |
98 | | - |
99 | | -CATEGORY_LABELS = {} |
100 | | - |
101 | 80 | # Compiled regex patterns for CC license detection |
102 | 81 | CC_PATTERNS = [ |
103 | 82 | (re.compile(r"\bCC[-\s]?0\b", re.IGNORECASE), "CC0"), |
|
119 | 98 | ), |
120 | 99 | ] |
121 | 100 |
|
| 101 | +# ArXiv Categories - manually curated from ArXiv official taxonomy |
| 102 | +# Source: https://arxiv.org/category_taxonomy |
| 103 | +CATEGORIES = { |
| 104 | + # Computer Science |
| 105 | + "cs.AI": "Artificial Intelligence", |
| 106 | + "cs.AR": "Hardware Architecture", |
| 107 | + "cs.CC": "Computational Complexity", |
| 108 | + "cs.CE": "Computational Engineering, Finance, and Science", |
| 109 | + "cs.CG": "Computational Geometry", |
| 110 | + "cs.CL": "Computation and Language", |
| 111 | + "cs.CR": "Cryptography and Security", |
| 112 | + "cs.CV": "Computer Vision and Pattern Recognition", |
| 113 | + "cs.CY": "Computers and Society", |
| 114 | + "cs.DB": "Databases", |
| 115 | + "cs.DC": "Distributed, Parallel, and Cluster Computing", |
| 116 | + "cs.DL": "Digital Libraries", |
| 117 | + "cs.DM": "Discrete Mathematics", |
| 118 | + "cs.DS": "Data Structures and Algorithms", |
| 119 | + "cs.ET": "Emerging Technologies", |
| 120 | + "cs.FL": "Formal Languages and Automata Theory", |
| 121 | + "cs.GL": "General Literature", |
| 122 | + "cs.GR": "Graphics", |
| 123 | + "cs.GT": "Computer Science and Game Theory", |
| 124 | + "cs.HC": "Human-Computer Interaction", |
| 125 | + "cs.IR": "Information Retrieval", |
| 126 | + "cs.IT": "Information Theory", |
| 127 | + "cs.LG": "Machine Learning", |
| 128 | + "cs.LO": "Logic in Computer Science", |
| 129 | + "cs.MA": "Multiagent Systems", |
| 130 | + "cs.MM": "Multimedia", |
| 131 | + "cs.MS": "Mathematical Software", |
| 132 | + "cs.NA": "Numerical Analysis", |
| 133 | + "cs.NE": "Neural and Evolutionary Computing", |
| 134 | + "cs.NI": "Networking and Internet Architecture", |
| 135 | + "cs.OH": "Other Computer Science", |
| 136 | + "cs.OS": "Operating Systems", |
| 137 | + "cs.PF": "Performance", |
| 138 | + "cs.PL": "Programming Languages", |
| 139 | + "cs.RO": "Robotics", |
| 140 | + "cs.SC": "Symbolic Computation", |
| 141 | + "cs.SD": "Sound", |
| 142 | + "cs.SE": "Software Engineering", |
| 143 | + "cs.SI": "Social and Information Networks", |
| 144 | + "cs.SY": "Systems and Control", |
| 145 | + # Mathematics |
| 146 | + "math.AC": "Commutative Algebra", |
| 147 | + "math.AG": "Algebraic Geometry", |
| 148 | + "math.AP": "Analysis of PDEs", |
| 149 | + "math.AT": "Algebraic Topology", |
| 150 | + "math.CA": "Classical Analysis and ODEs", |
| 151 | + "math.CO": "Combinatorics", |
| 152 | + "math.CT": "Category Theory", |
| 153 | + "math.CV": "Complex Variables", |
| 154 | + "math.DG": "Differential Geometry", |
| 155 | + "math.DS": "Dynamical Systems", |
| 156 | + "math.FA": "Functional Analysis", |
| 157 | + "math.GM": "General Mathematics", |
| 158 | + "math.GN": "General Topology", |
| 159 | + "math.GR": "Group Theory", |
| 160 | + "math.GT": "Geometric Topology", |
| 161 | + "math.HO": "History and Overview", |
| 162 | + "math.IT": "Information Theory", |
| 163 | + "math.KT": "K-Theory and Homology", |
| 164 | + "math.LO": "Logic", |
| 165 | + "math.MG": "Metric Geometry", |
| 166 | + "math.MP": "Mathematical Physics", |
| 167 | + "math.NA": "Numerical Analysis", |
| 168 | + "math.NT": "Number Theory", |
| 169 | + "math.OA": "Operator Algebras", |
| 170 | + "math.OC": "Optimization and Control", |
| 171 | + "math.PR": "Probability", |
| 172 | + "math.QA": "Quantum Algebra", |
| 173 | + "math.RA": "Rings and Algebras", |
| 174 | + "math.RT": "Representation Theory", |
| 175 | + "math.SG": "Symplectic Geometry", |
| 176 | + "math.SP": "Spectral Theory", |
| 177 | + "math.ST": "Statistics Theory", |
| 178 | + # Physics |
| 179 | + "physics.acc-ph": "Accelerator Physics", |
| 180 | + "physics.ao-ph": "Atmospheric and Oceanic Physics", |
| 181 | + "physics.app-ph": "Applied Physics", |
| 182 | + "physics.atm-clus": "Atomic and Molecular Clusters", |
| 183 | + "physics.atom-ph": "Atomic Physics", |
| 184 | + "physics.bio-ph": "Biological Physics", |
| 185 | + "physics.chem-ph": "Chemical Physics", |
| 186 | + "physics.class-ph": "Classical Physics", |
| 187 | + "physics.comp-ph": "Computational Physics", |
| 188 | + "physics.data-an": "Data Analysis, Statistics and Probability", |
| 189 | + "physics.ed-ph": "Physics Education", |
| 190 | + "physics.flu-dyn": "Fluid Dynamics", |
| 191 | + "physics.gen-ph": "General Physics", |
| 192 | + "physics.geo-ph": "Geophysics", |
| 193 | + "physics.hist-ph": "History and Philosophy of Physics", |
| 194 | + "physics.ins-det": "Instrumentation and Detectors", |
| 195 | + "physics.med-ph": "Medical Physics", |
| 196 | + "physics.optics": "Optics", |
| 197 | + "physics.plasm-ph": "Plasma Physics", |
| 198 | + "physics.pop-ph": "Popular Physics", |
| 199 | + "physics.soc-ph": "Physics and Society", |
| 200 | + "physics.space-ph": "Space Physics", |
| 201 | + # Statistics |
| 202 | + "stat.AP": "Applications", |
| 203 | + "stat.CO": "Computation", |
| 204 | + "stat.ME": "Methodology", |
| 205 | + "stat.ML": "Machine Learning", |
| 206 | + "stat.OT": "Other Statistics", |
| 207 | + "stat.TH": "Statistics Theory", |
| 208 | + # Quantitative Biology |
| 209 | + "q-bio.BM": "Biomolecules", |
| 210 | + "q-bio.CB": "Cell Behavior", |
| 211 | + "q-bio.GN": "Genomics", |
| 212 | + "q-bio.MN": "Molecular Networks", |
| 213 | + "q-bio.NC": "Neurons and Cognition", |
| 214 | + "q-bio.OT": "Other Quantitative Biology", |
| 215 | + "q-bio.PE": "Populations and Evolution", |
| 216 | + "q-bio.QM": "Quantitative Methods", |
| 217 | + "q-bio.SC": "Subcellular Processes", |
| 218 | + "q-bio.TO": "Tissues and Organs", |
| 219 | + # Economics |
| 220 | + "econ.EM": "Econometrics", |
| 221 | + "econ.GN": "General Economics", |
| 222 | + "econ.TH": "Theoretical Economics", |
| 223 | + # Electrical Engineering and Systems Science |
| 224 | + "eess.AS": "Audio and Speech Processing", |
| 225 | + "eess.IV": "Image and Video Processing", |
| 226 | + "eess.SP": "Signal Processing", |
| 227 | + "eess.SY": "Systems and Control", |
| 228 | + # High Energy Physics |
| 229 | + "hep-ex": "High Energy Physics - Experiment", |
| 230 | + "hep-lat": "High Energy Physics - Lattice", |
| 231 | + "hep-ph": "High Energy Physics - Phenomenology", |
| 232 | + "hep-th": "High Energy Physics - Theory", |
| 233 | + # Other Physics |
| 234 | + "astro-ph": "Astrophysics", |
| 235 | + "astro-ph.CO": "Cosmology and Nongalactic Astrophysics", |
| 236 | + "astro-ph.EP": "Earth and Planetary Astrophysics", |
| 237 | + "astro-ph.GA": "Astrophysics of Galaxies", |
| 238 | + "astro-ph.HE": "High Energy Astrophysical Phenomena", |
| 239 | + "astro-ph.IM": "Instrumentation and Methods for Astrophysics", |
| 240 | + "astro-ph.SR": "Solar and Stellar Astrophysics", |
| 241 | + "cond-mat.dis-nn": "Disordered Systems and Neural Networks", |
| 242 | + "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics", |
| 243 | + "cond-mat.mtrl-sci": "Materials Science", |
| 244 | + "cond-mat.other": "Other Condensed Matter", |
| 245 | + "cond-mat.quant-gas": "Quantum Gases", |
| 246 | + "cond-mat.soft": "Soft Condensed Matter", |
| 247 | + "cond-mat.stat-mech": "Statistical Mechanics", |
| 248 | + "cond-mat.str-el": "Strongly Correlated Electrons", |
| 249 | + "cond-mat.supr-con": "Superconductivity", |
| 250 | + "gr-qc": "General Relativity and Quantum Cosmology", |
| 251 | + "nlin.AO": "Adaptation and Self-Organizing Systems", |
| 252 | + "nlin.CD": "Chaotic Dynamics", |
| 253 | + "nlin.CG": "Cellular Automata and Lattice Gases", |
| 254 | + "nlin.PS": "Pattern Formation and Solitons", |
| 255 | + "nlin.SI": "Exactly Solvable and Integrable Systems", |
| 256 | + "nucl-ex": "Nuclear Experiment", |
| 257 | + "nucl-th": "Nuclear Theory", |
| 258 | + "quant-ph": "Quantum Physics", |
| 259 | +} |
| 260 | + |
| 261 | +# File Paths |
| 262 | +FILE_ARXIV_COUNT = shared.path_join(PATHS["data_1-fetch"], "arxiv_1_count.csv") |
| 263 | +FILE_ARXIV_CATEGORY_REPORT = shared.path_join( |
| 264 | + PATHS["data_1-fetch"], "arxiv_2_count_by_category_report.csv" |
| 265 | +) |
| 266 | +FILE_ARXIV_CATEGORY_REPORT_AGGREGATE = shared.path_join( |
| 267 | + PATHS["data_1-fetch"], "arxiv_2_count_by_category_report_agg.csv" |
| 268 | +) |
| 269 | +FILE_ARXIV_YEAR = shared.path_join( |
| 270 | + PATHS["data_1-fetch"], "arxiv_3_count_by_year.csv" |
| 271 | +) |
| 272 | +FILE_ARXIV_AUTHOR_BUCKET = shared.path_join( |
| 273 | + PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv" |
| 274 | +) |
| 275 | +# Records per-run metadata for audit, reproducibility, and provenance |
| 276 | +FILE_PROVENANCE = shared.path_join(PATHS["data"], "arxiv_provenance.yaml") |
| 277 | + |
| 278 | +# Runtime variables |
| 279 | +QUARTER = os.path.basename(PATHS["data_quarter"]) |
| 280 | + |
122 | 281 | # Log the start of the script execution |
123 | 282 | LOGGER.info("Script execution started.") |
124 | 283 |
|
@@ -313,7 +472,7 @@ def save_count_data( |
313 | 472 | for lic, cats in category_counts.items(): |
314 | 473 | total_for_license = sum(cats.values()) or 1 |
315 | 474 | for code, c in cats.items(): |
316 | | - label = shared.normalize_arxiv_category(code, CATEGORY_LABELS) |
| 475 | + label = shared.normalize_arxiv_category(code, CATEGORIES) |
317 | 476 | pct = round((c / total_for_license) * 100, 2) |
318 | 477 | writer.writerow( |
319 | 478 | { |
@@ -351,7 +510,7 @@ def save_count_data( |
351 | 510 | others = sorted_cats[TOP_N:] |
352 | 511 | other_count = sum(c for _, c in others) |
353 | 512 | for code, c in top: |
354 | | - label = shared.normalize_arxiv_category(code, CATEGORY_LABELS) |
| 513 | + label = shared.normalize_arxiv_category(code, CATEGORIES) |
355 | 514 | writer.writerow( |
356 | 515 | { |
357 | 516 | "TOOL_IDENTIFIER": lic, |
@@ -413,9 +572,6 @@ def query_arxiv(args): |
413 | 572 | LOGGER.info("Beginning to fetch results from ArXiv API") |
414 | 573 | session = get_requests_session() |
415 | 574 |
|
416 | | - # Load category mappings using shared function |
417 | | - CATEGORY_LABELS.update(shared.load_arxiv_categories(PATHS.get("data"))) |
418 | | - |
419 | 575 | results_per_iteration = RESULTS_PER_REQUEST |
420 | 576 |
|
421 | 577 | search_queries = SEARCH_QUERIES |
|
0 commit comments