|
13 | 13 | import traceback |
14 | 14 | from collections import Counter, defaultdict |
15 | 15 | from copy import copy |
16 | | -from datetime import datetime, timezone |
17 | 16 | from operator import itemgetter |
18 | 17 |
|
19 | 18 | # Third-party |
|
37 | 36 | BASE_URL = "https://oaipmh.arxiv.org/oai" |
38 | 37 | # Defaults should result in quick operation (not complete operation) |
39 | 38 | DEFAULT_FETCH_LIMIT = 4500 # Fetch 3 batches of 1,500 articles each |
40 | | -DEFAULT_YEARS_BACK = 5 |
41 | 39 | # CSV file paths |
42 | 40 | FILE_ARXIV_AUTHOR_BUCKET = shared.path_join( |
43 | 41 | PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv" |
@@ -91,40 +89,13 @@ def parse_arguments(): |
91 | 89 | default=DEFAULT_FETCH_LIMIT, |
92 | 90 | help=( |
93 | 91 | "Limit number of fetched articles (default:" |
94 | | - f" {DEFAULT_FETCH_LIMIT}). Use a value of -1 to remove limit." |
| 92 | + f" {DEFAULT_FETCH_LIMIT}). Use a value of -1 to fetch all articles" |
| 93 | + " (remove limit)." |
95 | 94 | ), |
96 | 95 | ) |
97 | | - parser.add_argument( |
98 | | - "--years-back", |
99 | | - type=int, |
100 | | - default=DEFAULT_YEARS_BACK, |
101 | | - help=( |
102 | | - "Number of years back from current year to fetch (default:" |
103 | | - f" {DEFAULT_YEARS_BACK}). Use a value of -1 to specify 2008-02-05" |
104 | | - " (first date a CC licensed article was added)." |
105 | | - ), |
106 | | - ) |
107 | | - |
108 | 96 | args = parser.parse_args() |
109 | 97 | if not args.enable_save and args.enable_git: |
110 | 98 | parser.error("--enable-git requires --enable-save") |
111 | | - # Restrict args.years_back to earliest datetime and initialize |
112 | | - # args.from_date |
113 | | - # |
114 | | - # Survey of records indicated the first CC licenced article was added on |
115 | | - # 2008-02-05 |
116 | | - earliest_date = datetime(2008, 2, 5, tzinfo=timezone.utc) |
117 | | - this_year = datetime.now(timezone.utc).year |
118 | | - if args.years_back == -1: |
119 | | - arg_date = earliest_date |
120 | | - else: |
121 | | - start_year = this_year - args.years_back |
122 | | - arg_date = datetime(start_year, 1, 1, tzinfo=timezone.utc) |
123 | | - if arg_date < earliest_date: |
124 | | - arg_date = earliest_date |
125 | | - args.from_date = arg_date.strftime("%Y-%m-%d") |
126 | | - args.years_back = this_year - arg_date.year |
127 | | - |
128 | 99 | return args |
129 | 100 |
|
130 | 101 |
|
@@ -316,9 +287,12 @@ def query_arxiv(args, session): |
316 | 287 | """ |
317 | 288 | Query ArXiv OAI-PMH API and return information about CC licensed articles. |
318 | 289 | """ |
| 290 | + if args.limit == -1: |
| 291 | + count_desc = "all" |
| 292 | + else: |
| 293 | + count_desc = f"a maximum of {args.limit}" |
319 | 294 | LOGGER.info( |
320 | | - f"Querying articles from {args.from_date} onwards ({args.years_back}" |
321 | | - " years back)" |
| 295 | + f"Fetching {count_desc} articles starting from add date 2008-02-05"
322 | 296 | ) |
323 | 297 |
|
324 | 298 | # Data structures for counting |
@@ -353,7 +327,7 @@ def query_arxiv(args, session): |
353 | 327 | params = { |
354 | 328 | "verb": "ListRecords", |
355 | 329 | "metadataPrefix": "arXiv", |
356 | | - "from": args.from_date, |
| 330 | + "from": "2008-02-05", # First addition of CC licensed articles |
357 | 331 | } |
358 | 332 | verb = "starting" |
359 | 333 |
|
@@ -542,12 +516,9 @@ def write_provence(args, cc_articles_found): |
542 | 516 | provenance_data = { |
543 | 517 | "api_description": desc, |
544 | 518 | "api_endpoint": BASE_URL, |
545 | | - "arguments": { |
546 | | - "from_date": args.from_date, |
547 | | - "limit": args.limit, |
548 | | - "years_back": args.years_back, |
549 | | - }, |
550 | 519 | "cc_articles_found": cc_articles_found, |
| 520 | + "fetch_limit": args.limit, |
| 521 | + "from_add_date": "2008-02-05", |
551 | 522 | "quarter": QUARTER, |
552 | 523 | "script": os.path.basename(__file__), |
553 | 524 | } |
|
0 commit comments