Skip to content

Commit 76d2184

Browse files
committed
Reorganize constants in logical order and fix static analysis issues
1 parent d04179b commit 76d2184

1 file changed

Lines changed: 22 additions & 17 deletions

File tree

scripts/1-fetch/arxiv_fetch.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,16 @@
3838
BASE_URL = "http://export.arxiv.org/api/query?"
3939
DEFAULT_FETCH_LIMIT = 800 # Default total papers to fetch
4040

41-
41+
# CSV Headers
42+
HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER", "AUTHOR_BUCKET", "COUNT"]
43+
HEADER_CATEGORY_REPORT = [
44+
"TOOL_IDENTIFIER",
45+
"CATEGORY_CODE",
46+
"CATEGORY_LABEL",
47+
"COUNT",
48+
]
49+
HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
50+
HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"]
4251

4352
# Search Queries
4453
SEARCH_QUERIES = [
@@ -60,17 +69,6 @@
6069
'all:"CC-0"',
6170
]
6271

63-
# CSV Headers
64-
HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
65-
HEADER_CATEGORY_REPORT = [
66-
"TOOL_IDENTIFIER",
67-
"CATEGORY_CODE",
68-
"CATEGORY_LABEL",
69-
"COUNT",
70-
]
71-
HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"]
72-
HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER", "AUTHOR_BUCKET", "COUNT"]
73-
7472
# Compiled regex patterns for CC license detection
7573
CC_PATTERNS = [
7674
(re.compile(r"\bCC[-\s]?0\b", re.IGNORECASE), "CC0"),
@@ -264,7 +262,9 @@
264262
PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv"
265263
)
266264
# records metadata for each run for audit, reproducibility, and provenance
267-
FILE_PROVENANCE = shared.path_join(PATHS["data_1-fetch"], "arxiv_provenance.yaml")
265+
FILE_PROVENANCE = shared.path_join(
266+
PATHS["data_1-fetch"], "arxiv_provenance.yaml"
267+
)
268268

269269
# Runtime variables
270270
QUARTER = os.path.basename(PATHS["data_quarter"])
@@ -273,7 +273,7 @@
273273
# parsing arguments function
274274
def parse_arguments():
275275
"""Parse command-line options, returns parsed argument namespace.
276-
276+
277277
Note: The --limit parameter sets the total number of papers to fetch
278278
across all search queries, not per query. The ArXiv API recommends a
279279
maximum of 30000 results per session for optimal performance.
@@ -287,8 +287,10 @@ def parse_arguments():
287287
help=(
288288
f"Total limit of papers to fetch across all search queries "
289289
f"(default: {DEFAULT_FETCH_LIMIT}). Maximum recommended: 30000. "
290-
f"Note: Individual queries limited to 500 results (implementation choice). "
291-
f"See ArXiv API documentation: https://info.arxiv.org/help/api/user-manual.html"
290+
f"Note: Individual queries limited to 500 results "
291+
f"(implementation choice). "
292+
f"See ArXiv API documentation: "
293+
f"https://info.arxiv.org/help/api/user-manual.html"
292294
),
293295
)
294296
parser.add_argument(
@@ -602,7 +604,10 @@ def query_arxiv(args):
602604
if papers_found_in_batch == 0:
603605
break
604606

605-
LOGGER.info(f"Query '{search_query}' completed: {papers_found_for_query} papers found")
607+
LOGGER.info(
608+
f"Query '{search_query}' completed: "
609+
f"{papers_found_for_query} papers found"
610+
)
606611

607612
# Save results
608613
if args.enable_save:

0 commit comments

Comments
 (0)