Skip to content

Commit 5348e8b

Browse files
committed
Refactored to use url library, enhanced retry and extraction logic
1 parent f8c9774 commit 5348e8b

1 file changed

Lines changed: 13 additions & 11 deletions

File tree

scripts/1-fetch/arxiv_fetch.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -203,12 +203,12 @@ def get_requests_session():
203203

204204
def query_arxiv(args):
205205
"""Query ArXiv API for papers with potential CC licenses."""
206-
206+
207207
LOGGER.info("Beginning to fetch results from ArXiv API")
208208

209209
session = get_requests_session()
210210
results_per_iteration = 50
211-
211+
212212
search_queries = [
213213
'all:"creative commons"',
214214
'all:"CC BY"',
@@ -246,7 +246,7 @@ def query_arxiv(args):
246246

247247
papers_found_in_batch = 0
248248

249-
try:
249+
try:
250250
LOGGER.info(
251251
f"Fetching results {start} - "
252252
f"{start + results_per_iteration}"
@@ -258,9 +258,9 @@ def query_arxiv(args):
258258
for entry in feed.entries:
259259
if total_fetched >= args.limit:
260260
break
261-
261+
262262
license_info = extract_license_info(entry)
263-
263+
264264
if license_info != "Unknown":
265265
category = extract_category_from_entry(entry)
266266
year = extract_year_from_entry(entry)
@@ -286,11 +286,12 @@ def query_arxiv(args):
286286
f"{category} - {year}"
287287
)
288288

289-
# arXiv recommends a 3-seconds delay between consecutive api calls for efficiency
289+
# arXiv recommends a 3-second delay between consecutive
290+
# API calls for efficiency
290291
time.sleep(3)
291-
except requests.RequestException as e:
292-
LOGGER.error(f"Request failed: {e}")
293-
break
292+
except requests.RequestException as e:
293+
LOGGER.error(f"Request failed: {e}")
294+
break
294295

295296
if papers_found_in_batch == 0:
296297
consecutive_empty_calls += 1
@@ -304,14 +305,15 @@ def query_arxiv(args):
304305
consecutive_empty_calls = 0
305306

306307
# Save results
307-
308+
308309
if args.enable_save:
309310
save_count_data(
310311
license_counts, category_counts, year_counts, author_counts
311312
)
312313

313314
LOGGER.info(f"Total CC licensed papers fetched: {total_fetched}")
314315

316+
315317
def save_count_data(
316318
license_counts, category_counts, year_counts, author_counts
317319
):
@@ -364,7 +366,7 @@ def save_count_data(
364366
"COUNT": count,
365367
}
366368
)
367-
369+
368370
# Save author count data
369371
with open(FILE_ARXIV_AUTHOR, "w", newline="") as file_obj:
370372
writer = csv.DictWriter(

0 commit comments

Comments
 (0)