
Commit 2ef6c6f

refactor: use shared category functions in arxiv_fetch.py
1 parent 9183088 commit 2ef6c6f

1 file changed: 37 additions & 126 deletions


scripts/1-fetch/arxiv_fetch.py

@@ -26,11 +26,6 @@
 
 # Add parent directory so shared can be imported
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-# Add dev directory for category converter
-sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "dev"))
-
-# Third-party
-import arxiv_category_converter  # noqa: E402
 
 # First-party/Local
 import shared  # noqa: E402
@@ -49,7 +44,7 @@
 # HTTP Retry Configuration (using shared constants where available)
 RETRY_TOTAL = 5
 RETRY_BACKOFF_FACTOR = 1
-# STATUS_FORCELIST imported from shared.py
+
 
 # Search Queries
 SEARCH_QUERIES = [
@@ -73,9 +68,6 @@
 
 # File Paths
 FILE_ARXIV_COUNT = shared.path_join(PATHS["data_1-fetch"], "arxiv_1_count.csv")
-FILE_ARXIV_CATEGORY = shared.path_join(
-    PATHS["data_1-fetch"], "arxiv_2_count_by_category.csv"
-)
 FILE_ARXIV_CATEGORY_REPORT = shared.path_join(
     PATHS["data_1-fetch"], "arxiv_2_count_by_category_report.csv"
 )
@@ -85,17 +77,13 @@
 FILE_ARXIV_YEAR = shared.path_join(
     PATHS["data_1-fetch"], "arxiv_3_count_by_year.csv"
 )
-FILE_ARXIV_AUTHOR = shared.path_join(
-    PATHS["data_1-fetch"], "arxiv_4_count_by_author_count.csv"
-)
 FILE_ARXIV_AUTHOR_BUCKET = shared.path_join(
     PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv"
 )
 # records metadata for each run for audit, reproducibility, and provenance
 FILE_PROVENANCE = shared.path_join(PATHS["data"], "arxiv_provenance.yaml")
 
 HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
-HEADER_CATEGORY = ["TOOL_IDENTIFIER", "CATEGORY", "COUNT"]
 HEADER_CATEGORY_REPORT = [
     "TOOL_IDENTIFIER",
     "CATEGORY_CODE",
@@ -104,7 +92,6 @@
     "PERCENT",
 ]
 HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"]
-HEADER_AUTHOR = ["TOOL_IDENTIFIER", "AUTHOR_COUNT", "COUNT"]
 HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER", "AUTHOR_BUCKET", "COUNT"]
 
 QUARTER = os.path.basename(PATHS["data_quarter"])
@@ -136,52 +123,6 @@
 LOGGER.info("Script execution started.")
 
 
-def load_category_map(paths):
-    """Load category->label mapping from data/arxiv_category_map.yaml.
-    Returns a dict (possibly empty) and logs failures silently.
-    """
-    paths_to_check = []
-    # use the repository data directory
-    repository_data_dir = (
-        paths.get("data") if isinstance(paths, dict) else None
-    )
-    if repository_data_dir:
-        paths_to_check.append(
-            os.path.join(repository_data_dir, "arxiv_category_map.yaml")
-        )
-
-    # allow for looking two levels up (data/)
-    paths_to_check.append(
-        os.path.join(
-            os.path.dirname(__file__),
-            "..",
-            "..",
-            "data",
-            "arxiv_category_map.yaml",
-        )
-    )
-
-    for p in paths_to_check:
-        p = os.path.abspath(os.path.realpath(p))
-        try:
-            if os.path.exists(p):
-                with open(p, "r", encoding="utf-8") as fh:
-                    data = yaml.safe_load(fh)
-                if isinstance(data, dict):
-                    # Normalise keys/values to strings for readability
-                    return {str(k).strip(): str(v) for k, v in data.items()}
-        except Exception as e:
-            LOGGER = globals().get("LOGGER")
-            if LOGGER:
-                LOGGER.warning("Failed to load category map %s: %s", p, e)
-            else:
-                print(
-                    f"Warning: Failed to load category map {p}: {e}",
-                    file=sys.stderr,
-                )
-    return {}
-
-
 # parsing arguments function
 def parse_arguments():
     """Parse command-line options, returns parsed argument namespace."""
@@ -229,9 +170,8 @@ def initialize_all_data_files(args):
 
     os.makedirs(PATHS["data_1-fetch"], exist_ok=True)
     initialize_data_file(FILE_ARXIV_COUNT, HEADER_COUNT)
-    initialize_data_file(FILE_ARXIV_CATEGORY, HEADER_CATEGORY)
+    initialize_data_file(FILE_ARXIV_CATEGORY_REPORT, HEADER_CATEGORY_REPORT)
     initialize_data_file(FILE_ARXIV_YEAR, HEADER_YEAR)
-    initialize_data_file(FILE_ARXIV_AUTHOR, HEADER_AUTHOR)
    initialize_data_file(FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET)
 
 
@@ -249,7 +189,12 @@ def get_requests_session():
 
 
 def normalize_license_text(raw_text):
-    """Normalize license text to standard CC license identifiers."""
+    """
+    Convert raw license text to standardized CC license identifiers.
+
+    Uses regex patterns to identify CC licenses from paper text.
+    Returns specific license (e.g., "CC BY", "CC0") or "Unknown".
+    """
     if not raw_text:
         return "Unknown"
 
@@ -261,7 +206,12 @@ def normalize_license_text(raw_text):
 
 
 def extract_license_info(entry):
-    """Extract CC license information from ArXiv entry."""
+    """
+    Extract CC license information from ArXiv paper entry.
+
+    Checks rights field first, then summary field for license patterns.
+    Returns normalized license identifier or "Unknown".
+    """
     # checking through the rights field first then summary
     if hasattr(entry, "rights") and entry.rights:
         license_info = normalize_license_text(entry.rights)
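
The new docstrings describe regex-based normalization and a rights-then-summary lookup, but neither function body is fully visible in these hunks. A minimal sketch of that approach, with illustrative patterns and names that may not match the script's actual ones:

```python
import re

# Illustrative CC patterns, ordered most-specific first; the real table may differ.
_CC_PATTERNS = [
    (re.compile(r"publicdomain/zero|\bcc0\b", re.I), "CC0"),
    (re.compile(r"licenses/by-nc-nd|\bcc[ -]?by[ -]?nc[ -]?nd\b", re.I), "CC BY-NC-ND"),
    (re.compile(r"licenses/by-nc-sa|\bcc[ -]?by[ -]?nc[ -]?sa\b", re.I), "CC BY-NC-SA"),
    (re.compile(r"licenses/by-nc|\bcc[ -]?by[ -]?nc\b", re.I), "CC BY-NC"),
    (re.compile(r"licenses/by-sa|\bcc[ -]?by[ -]?sa\b", re.I), "CC BY-SA"),
    (re.compile(r"licenses/by|\bcc[ -]?by\b", re.I), "CC BY"),
]


def normalize_license_text_sketch(raw_text):
    """Return a CC identifier for raw_text, or "Unknown" (sketch only)."""
    if not raw_text:
        return "Unknown"
    for pattern, identifier in _CC_PATTERNS:
        if pattern.search(raw_text):
            return identifier
    return "Unknown"


def extract_license_info_sketch(entry):
    """Check entry.rights, then entry.summary, as the new docstring describes."""
    for field in ("rights", "summary"):
        value = getattr(entry, field, None)
        if value:
            license_info = normalize_license_text_sketch(value)
            if license_info != "Unknown":
                return license_info
    return "Unknown"
```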
@@ -314,6 +264,12 @@ def extract_author_count_from_entry(entry):
 
 
 def bucket_author_count(n):
+    """
+    Convert author count to predefined buckets for analysis.
+
+    Buckets: "1", "2-3", "4-6", "7-10", "11+", "Unknown"
+    Reduces granularity for better statistical analysis.
+    """
     if n is None:
         return "Unknown"
     if n == 1:
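
The hunk cuts off after the first branches of bucket_author_count; a completion consistent with the buckets named in the new docstring could look like the following (the exact cut-offs beyond n == 1 are assumed, not shown in this diff):

```python
def bucket_author_count_sketch(n):
    """Map an author count onto the buckets listed in the docstring (sketch)."""
    if n is None:
        return "Unknown"
    if n == 1:
        return "1"
    if n <= 3:
        return "2-3"
    if n <= 6:
        return "4-6"
    if n <= 10:
        return "7-10"
    return "11+"
```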
@@ -330,6 +286,10 @@
 def save_count_data(
     license_counts, category_counts, year_counts, author_counts
 ):
+    """
+    Save all collected data to CSV files.
+
+    """
     # license_counts: {license: count}
     # category_counts: {license: {category_code: count}}
     # year_counts: {license: {year: count}}
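
For reference, the comments above imply nested-dict accumulators shaped roughly as follows (values are made up; the author_counts shape is inferred, not shown in this hunk):

```python
# Hypothetical example inputs to save_count_data(); values are illustrative only.
license_counts = {"CC BY": 120, "CC0": 8}
category_counts = {"CC BY": {"cs.LG": 40, "astro-ph.GA": 12}}
year_counts = {"CC BY": {"2023": 60, "2024": 60}}
author_counts = {"CC BY": {1: 10, 2: 25, None: 3}}  # assumed: {license: {author_count: count}}
```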
@@ -342,16 +302,6 @@ def save_count_data(
         for lic, c in license_counts.items():
             writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": c})
 
-    # Save detailed category counts (code)
-    with open(FILE_ARXIV_CATEGORY, "w", newline="", encoding="utf-8") as fh:
-        writer = csv.DictWriter(fh, fieldnames=HEADER_CATEGORY, dialect="unix")
-        writer.writeheader()
-        for lic, cats in category_counts.items():
-            for code, c in cats.items():
-                writer.writerow(
-                    {"TOOL_IDENTIFIER": lic, "CATEGORY": code, "COUNT": c}
-                )
-
     # Save category report with labels and percent
     with open(
         FILE_ARXIV_CATEGORY_REPORT, "w", newline="", encoding="utf-8"
@@ -363,14 +313,7 @@ def save_count_data(
         for lic, cats in category_counts.items():
             total_for_license = sum(cats.values()) or 1
             for code, c in cats.items():
-                label = CATEGORY_LABELS.get(
-                    code,
-                    (
-                        code.split(".")[0].upper()
-                        if code and "." in code
-                        else code
-                    ),
-                )
+                label = shared.normalize_arxiv_category(code, CATEGORY_LABELS)
                 pct = round((c / total_for_license) * 100, 2)
                 writer.writerow(
                     {
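
shared.normalize_arxiv_category itself is not part of this diff; judging from the inline fallback it replaces here (and again in the top-categories hunk below), its behavior is presumably close to this sketch:

```python
def normalize_arxiv_category_sketch(code, category_labels):
    """Return a friendly label for an arXiv category code (sketch).

    Mirrors the removed inline logic: use the mapping when available,
    otherwise fall back to the upper-cased archive prefix (e.g. "cs.AI" -> "CS").
    """
    fallback = code.split(".")[0].upper() if code and "." in code else code
    return category_labels.get(code, fallback)
```

So `normalize_arxiv_category_sketch("cs.LG", {})` would yield "CS", while a populated map yields its configured label.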
@@ -391,6 +334,7 @@ def save_count_data(
             fh,
             fieldnames=[
                 "TOOL_IDENTIFIER",
+                "CATEGORY_CODE",
                 "CATEGORY_LABEL",
                 "COUNT",
                 "PERCENT",
@@ -407,17 +351,11 @@ def save_count_data(
             others = sorted_cats[TOP_N:]
             other_count = sum(c for _, c in others)
             for code, c in top:
-                label = CATEGORY_LABELS.get(
-                    code,
-                    (
-                        code.split(".")[0].upper()
-                        if code and "." in code
-                        else code
-                    ),
-                )
+                label = shared.normalize_arxiv_category(code, CATEGORY_LABELS)
                 writer.writerow(
                     {
                         "TOOL_IDENTIFIER": lic,
+                        "CATEGORY_CODE": code,
                         "CATEGORY_LABEL": label,
                         "COUNT": c,
                         "PERCENT": round((c / total_for_license) * 100, 2),
@@ -427,6 +365,7 @@
                 writer.writerow(
                     {
                         "TOOL_IDENTIFIER": lic,
+                        "CATEGORY_CODE": "OTHER",
                         "CATEGORY_LABEL": "Other",
                         "COUNT": other_count,
                         "PERCENT": round(
@@ -445,20 +384,6 @@ def save_count_data(
                     {"TOOL_IDENTIFIER": lic, "YEAR": year, "COUNT": c}
                 )
 
-    # Save detailed author counts (AUTHOR_COUNT as integer or Unknown)
-    with open(FILE_ARXIV_AUTHOR, "w", newline="", encoding="utf-8") as fh:
-        writer = csv.DictWriter(fh, fieldnames=HEADER_AUTHOR, dialect="unix")
-        writer.writeheader()
-        for lic, acs in author_counts.items():
-            for ac, c in acs.items():
-                writer.writerow(
-                    {
-                        "TOOL_IDENTIFIER": lic,
-                        "AUTHOR_COUNT": ac if ac is not None else "Unknown",
-                        "COUNT": c,
-                    }
-                )
-
     # Save author buckets summary
     with open(
         FILE_ARXIV_AUTHOR_BUCKET, "w", newline="", encoding="utf-8"
@@ -480,17 +405,17 @@ def save_count_data(
 
 
 def query_arxiv(args):
-    """Query ArXiv API for papers with potential CC licenses."""
+    """
+    Main function to query ArXiv API and collect CC license data.
+
+    """
 
     LOGGER.info("Beginning to fetch results from ArXiv API")
     session = get_requests_session()
-    try:
-        loaded = load_category_map(PATHS)
-        if loaded:
-            # overlay loaded map over default
-            CATEGORY_LABELS.update(loaded)
-    except Exception as e:
-        LOGGER.warning("Error loading external arXiv category map: %s", e)
+
+    # Load category mappings using shared function
+    CATEGORY_LABELS.update(shared.load_arxiv_categories(PATHS.get("data")))
+
     results_per_iteration = RESULTS_PER_REQUEST
 
     search_queries = SEARCH_QUERIES
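
Similarly, shared.load_arxiv_categories is defined outside this file; based on the load_category_map function removed earlier in this commit, a minimal equivalent (name and exact behavior assumed, not confirmed by this diff) would be:

```python
import os

import yaml


def load_arxiv_categories_sketch(data_dir):
    """Load a {category code: label} dict from arxiv_category_map.yaml (sketch)."""
    if not data_dir:
        return {}
    path = os.path.join(data_dir, "arxiv_category_map.yaml")
    if not os.path.exists(path):
        return {}
    with open(path, "r", encoding="utf-8") as fh:
        data = yaml.safe_load(fh)
    if isinstance(data, dict):
        return {str(k).strip(): str(v) for k, v in data.items()}
    return {}
```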
@@ -584,25 +509,11 @@ def query_arxiv(args):
                consecutive_empty_calls = 0
 
     # Save results
-
     if args.enable_save:
         save_count_data(
             license_counts, category_counts, year_counts, author_counts
         )
 
-    # Convert category codes to user-friendly names
-    try:
-        input_file = FILE_ARXIV_CATEGORY
-        output_file = shared.path_join(
-            PATHS["data_1-fetch"], "arxiv_2_count_by_category_report.csv"
-        )
-        arxiv_category_converter.convert_categories_to_friendly_names(
-            input_file, output_file, PATHS["data"]
-        )
-        LOGGER.info(f"Category conversion completed: {output_file}")
-    except Exception as e:
-        LOGGER.warning(f"Category conversion failed: {e}")
-
     # save provenance
     provenance_data = {
         "total_fetched": total_fetched,
