Skip to content

Commit 95df48a

Browse files
committed
Improve arxiv_fetch.py: add debug logging, organize constants, use shared STATUS_FORCELIST
1 parent 6769a33 commit 95df48a

1 file changed

Lines changed: 46 additions & 31 deletions

File tree

scripts/1-fetch/arxiv_fetch.py

Lines changed: 46 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,39 @@
3939
LOGGER, PATHS = shared.setup(__file__)
4040

4141
# Constants
42+
# API Configuration
4243
BASE_URL = "http://export.arxiv.org/api/query?"
44+
API_DELAY_SECONDS = 3 # ArXiv recommended delay between API calls
45+
RESULTS_PER_REQUEST = 50 # Number of results per API request
46+
MAX_RESULTS_PER_QUERY = 500 # Maximum results to fetch per search query
47+
DEFAULT_FETCH_LIMIT = 800 # Default total papers to fetch
48+
49+
# HTTP Retry Configuration (using shared constants where available)
50+
RETRY_TOTAL = 5
51+
RETRY_BACKOFF_FACTOR = 1
52+
# STATUS_FORCELIST imported from shared.py
53+
54+
# Search Queries
55+
SEARCH_QUERIES = [
56+
'all:"creative commons"',
57+
'all:"CC BY"',
58+
'all:"CC-BY"',
59+
'all:"CC BY-NC"',
60+
'all:"CC-BY-NC"',
61+
'all:"CC BY-SA"',
62+
'all:"CC-BY-SA"',
63+
'all:"CC BY-ND"',
64+
'all:"CC-BY-ND"',
65+
'all:"CC BY-NC-SA"',
66+
'all:"CC-BY-NC-SA"',
67+
'all:"CC BY-NC-ND"',
68+
'all:"CC-BY-NC-ND"',
69+
'all:"CC0"',
70+
'all:"CC 0"',
71+
'all:"CC-0"',
72+
]
73+
74+
# File Paths
4375
FILE_ARXIV_COUNT = shared.path_join(PATHS["data_1-fetch"], "arxiv_1_count.csv")
4476
FILE_ARXIV_CATEGORY = shared.path_join(
4577
PATHS["data_1-fetch"], "arxiv_2_count_by_category.csv"
@@ -158,8 +190,8 @@ def parse_arguments():
158190
parser.add_argument(
159191
"--limit",
160192
type=int,
161-
default=800,
162-
help="Limit number of papers to fetch (default: 800)",
193+
default=DEFAULT_FETCH_LIMIT,
194+
help=f"Limit number of papers to fetch (default: {DEFAULT_FETCH_LIMIT})",
163195
)
164196
parser.add_argument(
165197
"--enable-save",
@@ -206,9 +238,9 @@ def initialize_all_data_files(args):
206238
def get_requests_session():
207239
"""Create request session with retry logic"""
208240
retry_strategy = Retry(
209-
total=5,
210-
backoff_factor=1,
211-
status_forcelist=[408, 429, 500, 502, 503, 504],
241+
total=RETRY_TOTAL,
242+
backoff_factor=RETRY_BACKOFF_FACTOR,
243+
status_forcelist=shared.STATUS_FORCELIST,
212244
)
213245
session = requests.Session()
214246
session.headers.update({"User-Agent": shared.USER_AGENT})
@@ -262,8 +294,8 @@ def extract_year_from_entry(entry):
262294
if hasattr(entry, "published") and entry.published:
263295
try:
264296
return entry.published[:4] # Extract year from date string
265-
except (AttributeError, IndexError):
266-
pass
297+
except (AttributeError, IndexError) as e:
298+
LOGGER.debug(f"Failed to extract year from entry.published '{entry.published}': {e}")
267299
return "Unknown"
268300

269301

@@ -272,8 +304,8 @@ def extract_author_count_from_entry(entry):
272304
if hasattr(entry, "authors") and entry.authors:
273305
try:
274306
return len(entry.authors)
275-
except Exception:
276-
pass
307+
except Exception as e:
308+
LOGGER.debug(f"Failed to count authors from entry.authors: {e}")
277309
if hasattr(entry, "author") and entry.author:
278310
return 1
279311
return "Unknown"
@@ -451,26 +483,9 @@ def query_arxiv(args):
451483
CATEGORY_LABELS.update(loaded)
452484
except Exception as e:
453485
LOGGER.warning("Error loading external arXiv category map: %s", e)
454-
results_per_iteration = 50
455-
456-
search_queries = [
457-
'all:"creative commons"',
458-
'all:"CC BY"',
459-
'all:"CC-BY"',
460-
'all:"CC BY-NC"',
461-
'all:"CC-BY-NC"',
462-
'all:"CC BY-SA"',
463-
'all:"CC-BY-SA"',
464-
'all:"CC BY-ND"',
465-
'all:"CC-BY-ND"',
466-
'all:"CC BY-NC-SA"',
467-
'all:"CC-BY-NC-SA"',
468-
'all:"CC BY-NC-ND"',
469-
'all:"CC-BY-NC-ND"',
470-
'all:"CC0"',
471-
'all:"CC 0"',
472-
'all:"CC-0"',
473-
]
486+
results_per_iteration = RESULTS_PER_REQUEST
487+
488+
search_queries = SEARCH_QUERIES
474489

475490
# Data structures for counting
476491
license_counts = defaultdict(int)
@@ -488,7 +503,7 @@ def query_arxiv(args):
488503
consecutive_empty_calls = 0
489504

490505
for start in range(
491-
0, min(args.limit - total_fetched, 500), results_per_iteration
506+
0, min(args.limit - total_fetched, MAX_RESULTS_PER_QUERY), results_per_iteration
492507
):
493508
encoded_query = urllib.parse.quote_plus(search_query)
494509
query = (
@@ -541,7 +556,7 @@ def query_arxiv(args):
541556

542557
# arXiv recommends a 3-seconds delay between consecutive
543558
# api calls for efficiency
544-
time.sleep(3)
559+
time.sleep(API_DELAY_SECONDS)
545560
except requests.RequestException as e:
546561
LOGGER.error(f"Request failed: {e}")
547562
break

0 commit comments

Comments (0)