|
3 | 3 | Fetch arXiv articles that use a CC legal tool using the OAI-PMH API. |
4 | 4 | OAI-PMH: Open Archives Initiative Protocol for Metadata Harvesting. |
5 | 5 | """ |
| 6 | + |
6 | 7 | # Standard library |
7 | 8 | import argparse |
8 | 9 | import csv |
|
61 | 62 | HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] |
62 | 63 | HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] |
63 | 64 | QUARTER = os.path.basename(PATHS["data_quarter"]) |
| 65 | +SUBSUMED_CATEGORIES = { |
| 66 | + # https://arxiv.org/archive/alg-geom |
| 67 | + # "The alg-geom archive has been subsumed into Algebraic Geometry |
| 68 | + # (math.AG)." |
| 69 | + "alg-geom": "math.AG", |
| 70 | + # https://arxiv.org/archive/chao-dyn |
| 71 | + # "The chao-dyn archive has been subsumed into Chaotic Dynamics (nlin.CD)." |
| 72 | + "chao-dyn": "nlin.CD", |
| 73 | + # https://arxiv.org/archive/dg-ga |
| 74 | + # "The dg-ga archive has been subsumed into Differential Geometry |
| 75 | + # (math.DG)." |
| 76 | + "dg-ga": "math.DG", |
| 77 | + # https://arxiv.org/archive/solv-int |
| 78 | + # "The solv-int archive has been subsumed into Exactly Solvable and |
| 79 | + # Integrable Systems (nlin.SI)." |
| 80 | + "solv-int": "nlin.SI", |
| 81 | + # https://arxiv.org/archive/q-alg |
| 82 | + # "The q-alg archive has been subsumed into Quantum Algebra (math.QA)." |
| 83 | + "q-alg": "math.QA", |
| 84 | +} |
64 | 85 |
|
65 | 86 |
|
66 | 87 | # parsing arguments function |
@@ -247,6 +268,10 @@ def extract_record_metadata(args, record): |
247 | 268 | categories_elem = record.find(".//{http://arxiv.org/OAI/arXiv/}categories") |
248 | 269 | if categories_elem is not None and categories_elem.text: |
249 | 270 | metadata["categories"] = categories_elem.text.strip().split() |
| 271 | + for index, code in enumerate(metadata["categories"]): |
| 272 | + metadata["categories"][index] = SUBSUMED_CATEGORIES.get(code, code) |
| 273 | + metadata["categories"] = list(set(metadata["categories"])) |
| 274 | + metadata["categories"].sort() |
250 | 275 | else: |
251 | 276 | metadata["categories"] = False |
252 | 277 |
|
@@ -409,6 +434,8 @@ def query_arxiv(args, session): |
409 | 434 | cc_articles_found += 1 |
410 | 435 |
|
411 | 436 | if args.show_added and cc_articles_added: |
| 437 | + cc_articles_added = list(set(cc_articles_added)) |
| 438 | + cc_articles_added.sort() |
412 | 439 | LOGGER.info(f" CC articles added: {', '.join(cc_articles_added)}") |
413 | 440 |
|
414 | 441 | LOGGER.info( |
|
0 commit comments