@@ -39,7 +39,39 @@
 LOGGER, PATHS = shared.setup(__file__)
 
 # Constants
+# API Configuration
 BASE_URL = "http://export.arxiv.org/api/query?"
+API_DELAY_SECONDS = 3  # arXiv-recommended delay between API calls
+RESULTS_PER_REQUEST = 50  # Number of results per API request
+MAX_RESULTS_PER_QUERY = 500  # Maximum results to fetch per search query
+DEFAULT_FETCH_LIMIT = 800  # Default total number of papers to fetch
+
+# HTTP Retry Configuration (using shared constants where available)
+RETRY_TOTAL = 5
+RETRY_BACKOFF_FACTOR = 1
+# STATUS_FORCELIST is imported from shared.py
+
+# Search Queries
+SEARCH_QUERIES = [
+    'all:"creative commons"',
+    'all:"CC BY"',
+    'all:"CC-BY"',
+    'all:"CC BY-NC"',
+    'all:"CC-BY-NC"',
+    'all:"CC BY-SA"',
+    'all:"CC-BY-SA"',
+    'all:"CC BY-ND"',
+    'all:"CC-BY-ND"',
+    'all:"CC BY-NC-SA"',
+    'all:"CC-BY-NC-SA"',
+    'all:"CC BY-NC-ND"',
+    'all:"CC-BY-NC-ND"',
+    'all:"CC0"',
+    'all:"CC 0"',
+    'all:"CC-0"',
+]
+
+# File Paths
 FILE_ARXIV_COUNT = shared.path_join(PATHS["data_1-fetch"], "arxiv_1_count.csv")
 FILE_ARXIV_CATEGORY = shared.path_join(
     PATHS["data_1-fetch"], "arxiv_2_count_by_category.csv"
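
For context, a minimal sketch of how these constants combine into a single arXiv API request. `build_query_url` is an illustrative helper, not part of this diff, but `search_query`, `start`, and `max_results` are real parameters of the arXiv Atom API:

```python
import urllib.parse

BASE_URL = "http://export.arxiv.org/api/query?"
RESULTS_PER_REQUEST = 50


def build_query_url(search_query: str, start: int) -> str:
    # The arXiv Atom API accepts search_query, start (result offset),
    # and max_results as query parameters.
    params = {
        "search_query": search_query,
        "start": start,
        "max_results": RESULTS_PER_REQUEST,
    }
    return BASE_URL + urllib.parse.urlencode(params)


# First page of results for one of the SEARCH_QUERIES entries:
print(build_query_url('all:"creative commons"', 0))
# http://export.arxiv.org/api/query?search_query=all%3A%22creative+commons%22&start=0&max_results=50
```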
@@ -158,8 +190,8 @@ def parse_arguments():
     parser.add_argument(
         "--limit",
         type=int,
-        default=800,
-        help="Limit number of papers to fetch (default: 800)",
+        default=DEFAULT_FETCH_LIMIT,
+        help=f"Limit number of papers to fetch (default: {DEFAULT_FETCH_LIMIT})",
     )
     parser.add_argument(
         "--enable-save",
@@ -206,9 +238,9 @@ def initialize_all_data_files(args):
 def get_requests_session():
     """Create request session with retry logic"""
     retry_strategy = Retry(
-        total=5,
-        backoff_factor=1,
-        status_forcelist=[408, 429, 500, 502, 503, 504],
+        total=RETRY_TOTAL,
+        backoff_factor=RETRY_BACKOFF_FACTOR,
+        status_forcelist=shared.STATUS_FORCELIST,
    )
     session = requests.Session()
     session.headers.update({"User-Agent": shared.USER_AGENT})
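
The hunk ends before the session is returned; the usual pattern mounts the retry strategy on the session through an `HTTPAdapter`. A self-contained sketch under that assumption, with the `shared` constants inlined (the forcelist values are taken from the removed lines above; `USER_AGENT` is a stand-in):

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

RETRY_TOTAL = 5
RETRY_BACKOFF_FACTOR = 1
STATUS_FORCELIST = [408, 429, 500, 502, 503, 504]  # values from the removed lines
USER_AGENT = "example-fetcher/0.1"  # stand-in for shared.USER_AGENT


def get_requests_session() -> requests.Session:
    retry_strategy = Retry(
        total=RETRY_TOTAL,
        backoff_factor=RETRY_BACKOFF_FACTOR,  # exponential backoff between retries
        status_forcelist=STATUS_FORCELIST,
    )
    session = requests.Session()
    session.headers.update({"User-Agent": USER_AGENT})
    # Mount the retry strategy on both schemes so every request made
    # through this session is retried on the listed status codes.
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session
```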
@@ -262,8 +294,8 @@ def extract_year_from_entry(entry):
     if hasattr(entry, "published") and entry.published:
         try:
             return entry.published[:4]  # Extract year from date string
-        except (AttributeError, IndexError):
-            pass
+        except (AttributeError, IndexError) as e:
+            LOGGER.debug(f"Failed to extract year from entry.published '{entry.published}': {e}")
     return "Unknown"
 
 
@@ -272,8 +304,8 @@ def extract_author_count_from_entry(entry):
     if hasattr(entry, "authors") and entry.authors:
         try:
             return len(entry.authors)
-        except Exception:
-            pass
+        except Exception as e:
+            LOGGER.debug(f"Failed to count authors from entry.authors: {e}")
     if hasattr(entry, "author") and entry.author:
         return 1
     return "Unknown"
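
Both helpers fall back progressively rather than raising. A quick self-contained sketch of that behavior, with the helpers re-declared locally and the logging calls omitted; `SimpleNamespace` stands in for a feedparser entry:

```python
from types import SimpleNamespace


def extract_year_from_entry(entry):
    if hasattr(entry, "published") and entry.published:
        try:
            return entry.published[:4]  # year prefix of an ISO date string
        except (AttributeError, IndexError):
            pass  # LOGGER.debug call omitted in this sketch
    return "Unknown"


def extract_author_count_from_entry(entry):
    if hasattr(entry, "authors") and entry.authors:
        try:
            return len(entry.authors)
        except Exception:
            pass  # LOGGER.debug call omitted in this sketch
    if hasattr(entry, "author") and entry.author:
        return 1  # "author" without "authors": assume a single author
    return "Unknown"


full = SimpleNamespace(published="2023-05-17T00:00:00Z", authors=[{}, {}])
sparse = SimpleNamespace(author="A. Author")
empty = SimpleNamespace()

print(extract_year_from_entry(full), extract_author_count_from_entry(full))      # 2023 2
print(extract_year_from_entry(sparse), extract_author_count_from_entry(sparse))  # Unknown 1
print(extract_year_from_entry(empty), extract_author_count_from_entry(empty))    # Unknown Unknown
```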
@@ -451,26 +483,9 @@ def query_arxiv(args):
             CATEGORY_LABELS.update(loaded)
         except Exception as e:
             LOGGER.warning("Error loading external arXiv category map: %s", e)
-    results_per_iteration = 50
-
-    search_queries = [
-        'all:"creative commons"',
-        'all:"CC BY"',
-        'all:"CC-BY"',
-        'all:"CC BY-NC"',
-        'all:"CC-BY-NC"',
-        'all:"CC BY-SA"',
-        'all:"CC-BY-SA"',
-        'all:"CC BY-ND"',
-        'all:"CC-BY-ND"',
-        'all:"CC BY-NC-SA"',
-        'all:"CC-BY-NC-SA"',
-        'all:"CC BY-NC-ND"',
-        'all:"CC-BY-NC-ND"',
-        'all:"CC0"',
-        'all:"CC 0"',
-        'all:"CC-0"',
-    ]
+    results_per_iteration = RESULTS_PER_REQUEST
+
+    search_queries = SEARCH_QUERIES
 
     # Data structures for counting
     license_counts = defaultdict(int)
@@ -488,7 +503,7 @@ def query_arxiv(args):
         consecutive_empty_calls = 0
 
         for start in range(
-            0, min(args.limit - total_fetched, 500), results_per_iteration
+            0, min(args.limit - total_fetched, MAX_RESULTS_PER_QUERY), results_per_iteration
         ):
             encoded_query = urllib.parse.quote_plus(search_query)
             query = (
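
This loop pages through one search query in `RESULTS_PER_REQUEST`-sized steps, capped by both `MAX_RESULTS_PER_QUERY` and the papers still allowed under the global `--limit`. A worked example of the offsets it generates; `page_offsets` is illustrative only:

```python
RESULTS_PER_REQUEST = 50
MAX_RESULTS_PER_QUERY = 500


def page_offsets(limit: int, total_fetched: int) -> list[int]:
    # Mirrors the range() call above: stop at whichever is smaller, the
    # per-query cap or the number of papers remaining under the limit.
    stop = min(limit - total_fetched, MAX_RESULTS_PER_QUERY)
    return list(range(0, stop, RESULTS_PER_REQUEST))


print(page_offsets(800, 0))    # [0, 50, 100, ..., 450]: full 500-result cap
print(page_offsets(800, 680))  # [0, 50, 100]: only 120 papers left under the limit
print(page_offsets(800, 800))  # []: limit already reached, no requests made
```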
@@ -541,7 +556,7 @@ def query_arxiv(args):
 
             # arXiv recommends a 3-second delay between consecutive
             # API calls for efficiency
-            time.sleep(3)
+            time.sleep(API_DELAY_SECONDS)
         except requests.RequestException as e:
             LOGGER.error(f"Request failed: {e}")
             break