|
4 | 4 | from datetime import datetime, timezone |
5 | 5 |
|
6 | 6 | # Third-party |
| 7 | +import yaml |
7 | 8 | from git import InvalidGitRepositoryError, NoSuchPathError, Repo |
8 | 9 | from pandas import PeriodIndex |
9 | 10 |
|
@@ -161,6 +162,223 @@ def setup(current_file): |
161 | 162 | return logger, paths |
162 | 163 |
|
163 | 164 |
|
def get_arxiv_categories():
    """Return the built-in ArXiv category code -> label mapping.

    Keys are ArXiv category identifiers, either ``archive.SUBJECT`` (for
    example ``"cs.AI"``) or a bare archive name (for example ``"hep-th"``).
    Values are the human-readable category names.

    The table is a static snapshot kept in this file; it is not fetched from
    ArXiv, so it may lag behind the official taxonomy.

    Returns:
        dict: A new dictionary on every call, so callers may mutate the
        result without affecting later calls.
    """
    return {
        # Computer Science
        "cs.AI": "Artificial Intelligence",
        "cs.AR": "Hardware Architecture",
        "cs.CC": "Computational Complexity",
        "cs.CE": "Computational Engineering, Finance, and Science",
        "cs.CG": "Computational Geometry",
        "cs.CL": "Computation and Language",
        "cs.CR": "Cryptography and Security",
        "cs.CV": "Computer Vision and Pattern Recognition",
        "cs.CY": "Computers and Society",
        "cs.DB": "Databases",
        "cs.DC": "Distributed, Parallel, and Cluster Computing",
        "cs.DL": "Digital Libraries",
        "cs.DM": "Discrete Mathematics",
        "cs.DS": "Data Structures and Algorithms",
        "cs.ET": "Emerging Technologies",
        "cs.FL": "Formal Languages and Automata Theory",
        "cs.GL": "General Literature",
        "cs.GR": "Graphics",
        "cs.GT": "Computer Science and Game Theory",
        "cs.HC": "Human-Computer Interaction",
        "cs.IR": "Information Retrieval",
        "cs.IT": "Information Theory",
        "cs.LG": "Machine Learning",
        "cs.LO": "Logic in Computer Science",
        "cs.MA": "Multiagent Systems",
        "cs.MM": "Multimedia",
        "cs.MS": "Mathematical Software",
        "cs.NA": "Numerical Analysis",
        "cs.NE": "Neural and Evolutionary Computing",
        "cs.NI": "Networking and Internet Architecture",
        "cs.OH": "Other Computer Science",
        "cs.OS": "Operating Systems",
        "cs.PF": "Performance",
        "cs.PL": "Programming Languages",
        "cs.RO": "Robotics",
        "cs.SC": "Symbolic Computation",
        "cs.SD": "Sound",
        "cs.SE": "Software Engineering",
        "cs.SI": "Social and Information Networks",
        "cs.SY": "Systems and Control",
        # Mathematics
        "math.AC": "Commutative Algebra",
        "math.AG": "Algebraic Geometry",
        "math.AP": "Analysis of PDEs",
        "math.AT": "Algebraic Topology",
        "math.CA": "Classical Analysis and ODEs",
        "math.CO": "Combinatorics",
        "math.CT": "Category Theory",
        "math.CV": "Complex Variables",
        "math.DG": "Differential Geometry",
        "math.DS": "Dynamical Systems",
        "math.FA": "Functional Analysis",
        "math.GM": "General Mathematics",
        "math.GN": "General Topology",
        "math.GR": "Group Theory",
        "math.GT": "Geometric Topology",
        "math.HO": "History and Overview",
        "math.IT": "Information Theory",
        "math.KT": "K-Theory and Homology",
        "math.LO": "Logic",
        "math.MG": "Metric Geometry",
        "math.MP": "Mathematical Physics",
        "math.NA": "Numerical Analysis",
        "math.NT": "Number Theory",
        "math.OA": "Operator Algebras",
        "math.OC": "Optimization and Control",
        "math.PR": "Probability",
        "math.QA": "Quantum Algebra",
        "math.RA": "Rings and Algebras",
        "math.RT": "Representation Theory",
        "math.SG": "Symplectic Geometry",
        "math.SP": "Spectral Theory",
        "math.ST": "Statistics Theory",
        # Physics
        "physics.acc-ph": "Accelerator Physics",
        "physics.ao-ph": "Atmospheric and Oceanic Physics",
        "physics.app-ph": "Applied Physics",
        "physics.atm-clus": "Atomic and Molecular Clusters",
        "physics.atom-ph": "Atomic Physics",
        "physics.bio-ph": "Biological Physics",
        "physics.chem-ph": "Chemical Physics",
        "physics.class-ph": "Classical Physics",
        "physics.comp-ph": "Computational Physics",
        "physics.data-an": "Data Analysis, Statistics and Probability",
        "physics.ed-ph": "Physics Education",
        "physics.flu-dyn": "Fluid Dynamics",
        "physics.gen-ph": "General Physics",
        "physics.geo-ph": "Geophysics",
        "physics.hist-ph": "History and Philosophy of Physics",
        "physics.ins-det": "Instrumentation and Detectors",
        "physics.med-ph": "Medical Physics",
        "physics.optics": "Optics",
        "physics.plasm-ph": "Plasma Physics",
        "physics.pop-ph": "Popular Physics",
        "physics.soc-ph": "Physics and Society",
        "physics.space-ph": "Space Physics",
        # Statistics
        "stat.AP": "Applications",
        "stat.CO": "Computation",
        "stat.ME": "Methodology",
        "stat.ML": "Machine Learning",
        "stat.OT": "Other Statistics",
        "stat.TH": "Statistics Theory",
        # Quantitative Biology
        "q-bio.BM": "Biomolecules",
        "q-bio.CB": "Cell Behavior",
        "q-bio.GN": "Genomics",
        "q-bio.MN": "Molecular Networks",
        "q-bio.NC": "Neurons and Cognition",
        "q-bio.OT": "Other Quantitative Biology",
        "q-bio.PE": "Populations and Evolution",
        "q-bio.QM": "Quantitative Methods",
        "q-bio.SC": "Subcellular Processes",
        "q-bio.TO": "Tissues and Organs",
        # Quantitative Finance
        "q-fin.CP": "Computational Finance",
        "q-fin.EC": "Economics",
        "q-fin.GN": "General Finance",
        "q-fin.MF": "Mathematical Finance",
        "q-fin.PM": "Portfolio Management",
        "q-fin.PR": "Pricing of Securities",
        "q-fin.RM": "Risk Management",
        "q-fin.ST": "Statistical Finance",
        "q-fin.TR": "Trading and Market Microstructure",
        # Economics
        "econ.EM": "Econometrics",
        "econ.GN": "General Economics",
        "econ.TH": "Theoretical Economics",
        # Electrical Engineering and Systems Science
        "eess.AS": "Audio and Speech Processing",
        "eess.IV": "Image and Video Processing",
        "eess.SP": "Signal Processing",
        "eess.SY": "Systems and Control",
        # High Energy Physics
        "hep-ex": "High Energy Physics - Experiment",
        "hep-lat": "High Energy Physics - Lattice",
        "hep-ph": "High Energy Physics - Phenomenology",
        "hep-th": "High Energy Physics - Theory",
        # Other Physics
        "astro-ph": "Astrophysics",
        "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
        "astro-ph.EP": "Earth and Planetary Astrophysics",
        "astro-ph.GA": "Astrophysics of Galaxies",
        "astro-ph.HE": "High Energy Astrophysical Phenomena",
        "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
        "astro-ph.SR": "Solar and Stellar Astrophysics",
        "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
        "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
        "cond-mat.mtrl-sci": "Materials Science",
        "cond-mat.other": "Other Condensed Matter",
        "cond-mat.quant-gas": "Quantum Gases",
        "cond-mat.soft": "Soft Condensed Matter",
        "cond-mat.stat-mech": "Statistical Mechanics",
        "cond-mat.str-el": "Strongly Correlated Electrons",
        "cond-mat.supr-con": "Superconductivity",
        "gr-qc": "General Relativity and Quantum Cosmology",
        # "math-ph" is the identifier ArXiv lists for Mathematical Physics;
        # "math.MP" above is kept as its alias.
        "math-ph": "Mathematical Physics",
        "nlin.AO": "Adaptation and Self-Organizing Systems",
        "nlin.CD": "Chaotic Dynamics",
        "nlin.CG": "Cellular Automata and Lattice Gases",
        "nlin.PS": "Pattern Formation and Solitons",
        "nlin.SI": "Exactly Solvable and Integrable Systems",
        "nucl-ex": "Nuclear Experiment",
        "nucl-th": "Nuclear Theory",
        "quant-ph": "Quantum Physics",
    }
| 327 | + |
| 328 | + |
def load_arxiv_categories(data_dir=None):
    """Load the ArXiv category mapping, preferring a local YAML cache.

    When ``data_dir`` is given, ``<data_dir>/arxiv_category_map.yaml`` is read
    first. If that file is missing, unreadable, empty, or does not hold a
    mapping, the built-in table from ``get_arxiv_categories()`` is used
    instead and written back to the same path for later runs. Both reading
    and writing are best effort: failures are logged as warnings and never
    raised to the caller.

    Args:
        data_dir: Directory that holds (or will hold) the YAML cache. A falsy
            value disables both reading and writing the cache.

    Returns:
        dict: The category mapping. Content loaded from YAML is only checked
        to be a mapping; its keys and values are not validated.
    """
    yaml_path = (
        os.path.join(data_dir, "arxiv_category_map.yaml") if data_dir else None
    )
    categories = {}

    # Try loading from the YAML cache first.
    if yaml_path:
        try:
            with open(yaml_path, "r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f)
        except FileNotFoundError:
            # No cache written yet; fall through to the built-in mapping.
            pass
        except (yaml.YAMLError, OSError, UnicodeDecodeError) as e:
            logging.warning(f"Failed to load category YAML: {e}")
        else:
            if loaded is None or isinstance(loaded, dict):
                # An empty document parses to None; treat it as "no entries".
                categories = loaded or {}
                logging.info(
                    f"Loaded {len(categories)} categories from {yaml_path}"
                )
            else:
                # A list or scalar would break callers that index by code,
                # so ignore it and use the built-in mapping instead.
                logging.warning(
                    "Failed to load category YAML: expected a mapping in "
                    f"{yaml_path}, got {type(loaded).__name__}"
                )

    # Fall back to the built-in mapping if nothing usable was loaded.
    if not categories:
        categories = get_arxiv_categories()
        if categories and yaml_path:
            # Cache the built-in mapping for future runs.
            try:
                os.makedirs(data_dir, exist_ok=True)
                with open(yaml_path, "w", encoding="utf-8") as f:
                    yaml.dump(
                        categories, f, default_flow_style=False, sort_keys=True
                    )
                logging.info(
                    f"Saved {len(categories)} categories to {yaml_path}"
                )
            except (yaml.YAMLError, OSError) as e:
                logging.warning(f"Failed to save categories: {e}")

    return categories
| 365 | + |
| 366 | + |
def normalize_arxiv_category(code, categories=None):
    """Translate an ArXiv category code into a display label.

    Args:
        code: Category code such as ``"cs.AI"`` or ``"hep-th"``.
        categories: Optional mapping of category codes to labels.

    Returns:
        ``code`` unchanged when it is falsy or equal to ``"Unknown"``; the
        mapped label when ``categories`` contains ``code``; otherwise the
        upper-cased text before the first ``"."``, or ``code`` itself when it
        contains no dot.
    """
    # Nothing to translate for empty values or the "Unknown" placeholder.
    if not code or code == "Unknown":
        return code

    has_label = bool(categories) and code in categories
    if has_label:
        return categories[code]

    # Unmapped code: keep only the archive part (before the first dot).
    archive, dot, _subject = code.partition(".")
    return archive.upper() if dot else code
| 380 | + |
| 381 | + |
164 | 382 | def update_readme( |
165 | 383 | args, |
166 | 384 | section_title, |
|
0 commit comments