Skip to content

Commit df6fe6b

Browse files
committed
Replace HTTP retry and API constants with literal values
1 parent 1880043 commit df6fe6b

1 file changed

Lines changed: 8 additions & 14 deletions

File tree

scripts/1-fetch/arxiv_fetch.py

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,9 @@
3636
# Constants
3737
# API Configuration
3838
BASE_URL = "http://export.arxiv.org/api/query?"
39-
API_DELAY_SECONDS = 3 # ArXiv recommended delay between API calls
40-
RESULTS_PER_REQUEST = 50 # Number of results per API request
41-
MAX_RESULTS_PER_QUERY = 500 # Maximum results to fetch per search query
4239
DEFAULT_FETCH_LIMIT = 800 # Default total papers to fetch
4340

44-
# HTTP Retry Configuration
45-
RETRY_TOTAL = 5
46-
RETRY_BACKOFF_FACTOR = 1
41+
4742

4843
# Search Queries
4944
SEARCH_QUERIES = [
@@ -337,8 +332,8 @@ def initialize_all_data_files(args):
337332
def get_requests_session():
338333
"""Create request session with retry logic"""
339334
retry_strategy = Retry(
340-
total=RETRY_TOTAL,
341-
backoff_factor=RETRY_BACKOFF_FACTOR,
335+
total=5,
336+
backoff_factor=1,
342337
status_forcelist=shared.STATUS_FORCELIST,
343338
)
344339
session = requests.Session()
@@ -485,7 +480,6 @@ def save_count_data(
485480
)
486481

487482
# Save aggregated category report (top N per license, rest -> Other)
488-
TOP_N = 10
489483
with open(
490484
FILE_ARXIV_CATEGORY_REPORT_AGGREGATE, "w", newline="", encoding="utf-8"
491485
) as fh:
@@ -506,8 +500,8 @@ def save_count_data(
506500
sorted_cats = sorted(
507501
cats.items(), key=lambda x: x[1], reverse=True
508502
)
509-
top = sorted_cats[:TOP_N]
510-
others = sorted_cats[TOP_N:]
503+
top = sorted_cats[:10]
504+
others = sorted_cats[10:]
511505
other_count = sum(c for _, c in others)
512506
for code, c in top:
513507
label = CATEGORIES.get(code, code)
@@ -572,7 +566,7 @@ def query_arxiv(args):
572566
LOGGER.info("Beginning to fetch results from ArXiv API")
573567
session = get_requests_session()
574568

575-
results_per_iteration = RESULTS_PER_REQUEST
569+
results_per_iteration = 50
576570

577571
search_queries = SEARCH_QUERIES
578572

@@ -593,7 +587,7 @@ def query_arxiv(args):
593587

594588
for start in range(
595589
0,
596-
min(args.limit - total_fetched, MAX_RESULTS_PER_QUERY),
590+
min(args.limit - total_fetched, 500),
597591
results_per_iteration,
598592
):
599593
encoded_query = urllib.parse.quote_plus(search_query)
@@ -647,7 +641,7 @@ def query_arxiv(args):
647641

648642
# arXiv recommends a 3-second delay between consecutive
649643
# API calls for efficiency
650-
time.sleep(API_DELAY_SECONDS)
644+
time.sleep(3)
651645
except requests.RequestException as e:
652646
LOGGER.error(f"Request failed: {e}")
653647
break

0 commit comments

Comments
 (0)