Skip to content

Commit 587e2e0

Browse files
committed
Fix static analysis issues in arxiv_fetch.py - line length and formatting
1 parent 4fb8f30 commit 587e2e0

1 file changed

Lines changed: 117 additions & 83 deletions

File tree

scripts/1-fetch/arxiv_fetch.py

Lines changed: 117 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import time
1414
import traceback
1515
import urllib.parse
16-
from collections import defaultdict, Counter
16+
from collections import Counter, defaultdict
1717

1818
# Third-party
1919
import feedparser
@@ -25,15 +25,16 @@
2525
from requests.adapters import HTTPAdapter
2626
from urllib3.util.retry import Retry
2727

28-
2928
# Add parent directory so shared can be imported
3029
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
3130
# Add dev directory for category converter
3231
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "dev"))
3332

33+
# Third-party
34+
import arxiv_category_converter # noqa: E402
35+
3436
# First-party/Local
3537
import shared # noqa: E402
36-
import arxiv_category_converter # noqa: E402
3738

3839
# Setup
3940
LOGGER, PATHS = shared.setup(__file__)
@@ -60,7 +61,9 @@
6061
PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv"
6162
)
6263
# records metadata for each run for audit, reproducibility, and provenance
63-
FILE_PROVENANCE = shared.path_join(PATHS["data_1-fetch"], "arxiv_provenance.json")
64+
FILE_PROVENANCE = shared.path_join(
65+
PATHS["data_1-fetch"], "arxiv_provenance.json"
66+
)
6467

6568
HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
6669
HEADER_CATEGORY = ["TOOL_IDENTIFIER", "CATEGORY", "COUNT"]
@@ -81,25 +84,31 @@
8184

8285
# Compiled regex patterns for CC license detection
8386
CC_PATTERNS = [
84-
(re.compile(r'\bCC[-\s]?0\b', re.IGNORECASE), "CC0"),
85-
(re.compile(r'\bCC[-\s]?BY[-\s]?NC[-\s]?ND\b', re.IGNORECASE),
86-
"CC BY-NC-ND"),
87-
(re.compile(r'\bCC[-\s]?BY[-\s]?NC[-\s]?SA\b', re.IGNORECASE),
88-
"CC BY-NC-SA"),
89-
(re.compile(r'\bCC[-\s]?BY[-\s]?ND\b', re.IGNORECASE), "CC BY-ND"),
90-
(re.compile(r'\bCC[-\s]?BY[-\s]?SA\b', re.IGNORECASE), "CC BY-SA"),
91-
(re.compile(r'\bCC[-\s]?BY[-\s]?NC\b', re.IGNORECASE), "CC BY-NC"),
92-
(re.compile(r'\bCC[-\s]?BY\b', re.IGNORECASE), "CC BY"),
93-
(re.compile(r'\bCREATIVE\s+COMMONS\b', re.IGNORECASE),
94-
"UNKNOWN CC legal tool"),
87+
(re.compile(r"\bCC[-\s]?0\b", re.IGNORECASE), "CC0"),
88+
(
89+
re.compile(r"\bCC[-\s]?BY[-\s]?NC[-\s]?ND\b", re.IGNORECASE),
90+
"CC BY-NC-ND",
91+
),
92+
(
93+
re.compile(r"\bCC[-\s]?BY[-\s]?NC[-\s]?SA\b", re.IGNORECASE),
94+
"CC BY-NC-SA",
95+
),
96+
(re.compile(r"\bCC[-\s]?BY[-\s]?ND\b", re.IGNORECASE), "CC BY-ND"),
97+
(re.compile(r"\bCC[-\s]?BY[-\s]?SA\b", re.IGNORECASE), "CC BY-SA"),
98+
(re.compile(r"\bCC[-\s]?BY[-\s]?NC\b", re.IGNORECASE), "CC BY-NC"),
99+
(re.compile(r"\bCC[-\s]?BY\b", re.IGNORECASE), "CC BY"),
100+
(
101+
re.compile(r"\bCREATIVE\s+COMMONS\b", re.IGNORECASE),
102+
"UNKNOWN CC legal tool",
103+
),
95104
]
96105

97106
# Log the start of the script execution
98107
LOGGER.info("Script execution started.")
99108

100109

101110
def load_category_map(paths):
102-
"""Load category->label mapping from data/arxiv_category_map.yaml if present
111+
"""Load category->label mapping from data/arxiv_category_map.yaml.
103112
Returns a dict (possibly empty) and logs failures silently.
104113
"""
105114
paths_to_check = []
@@ -115,8 +124,11 @@ def load_category_map(paths):
115124
# allow for looking two levels up (data/)
116125
paths_to_check.append(
117126
os.path.join(
118-
os.path.dirname(__file__), "..", "..", "data",
119-
"arxiv_category_map.yaml"
127+
os.path.dirname(__file__),
128+
"..",
129+
"..",
130+
"data",
131+
"arxiv_category_map.yaml",
120132
)
121133
)
122134

@@ -136,7 +148,7 @@ def load_category_map(paths):
136148
else:
137149
print(
138150
f"Warning: Failed to load category map {p}: {e}",
139-
file=sys.stderr
151+
file=sys.stderr,
140152
)
141153
return {}
142154

@@ -199,7 +211,7 @@ def get_requests_session():
199211
retry_strategy = Retry(
200212
total=5,
201213
backoff_factor=1,
202-
status_forcelist=[408, 429, 500, 502, 503, 504]
214+
status_forcelist=[408, 429, 500, 502, 503, 504],
203215
)
204216
session = requests.Session()
205217
session.headers.update({"User-Agent": shared.USER_AGENT})
@@ -208,7 +220,7 @@ def get_requests_session():
208220

209221

210222
def normalize_license_text(raw_text: str) -> str:
211-
"""Normalize license text to standard CC license identifiers using regex."""
223+
"""Normalize license text to standard CC license identifiers."""
212224
if not raw_text:
213225
return "Unknown"
214226

@@ -233,11 +245,12 @@ def extract_license_info(entry):
233245
return "Unknown"
234246

235247

236-
237248
def extract_category_from_entry(entry):
238249
"""Extract primary category from ArXiv entry."""
239-
if (hasattr(entry, "arxiv_primary_category") and
240-
entry.arxiv_primary_category):
250+
if (
251+
hasattr(entry, "arxiv_primary_category")
252+
and entry.arxiv_primary_category
253+
):
241254
return entry.arxiv_primary_category.get("term", "Unknown")
242255
if hasattr(entry, "tags") and entry.tags:
243256
# Get first category from tags
@@ -283,8 +296,9 @@ def bucket_author_count(n):
283296
return "11+"
284297

285298

286-
def save_count_data(license_counts, category_counts, year_counts,
287-
author_counts):
299+
def save_count_data(
300+
license_counts, category_counts, year_counts, author_counts
301+
):
288302
# license_counts: {license: count}
289303
# category_counts: {license: {category_code: count}}
290304
# year_counts: {license: {year: count}}
@@ -299,103 +313,122 @@ def save_count_data(license_counts, category_counts, year_counts,
299313

300314
# Save detailed category counts (code)
301315
with open(FILE_ARXIV_CATEGORY, "w", newline="") as fh:
302-
writer = csv.DictWriter(fh, fieldnames=HEADER_CATEGORY,
303-
dialect="unix")
316+
writer = csv.DictWriter(fh, fieldnames=HEADER_CATEGORY, dialect="unix")
304317
writer.writeheader()
305318
for lic, cats in category_counts.items():
306319
for code, c in cats.items():
307-
writer.writerow({
308-
"TOOL_IDENTIFIER": lic,
309-
"CATEGORY": code,
310-
"COUNT": c
311-
})
320+
writer.writerow(
321+
{"TOOL_IDENTIFIER": lic, "CATEGORY": code, "COUNT": c}
322+
)
312323

313324
# Save category report with labels and percent
314325
with open(FILE_ARXIV_CATEGORY_REPORT, "w", newline="") as fh:
315-
writer = csv.DictWriter(fh, fieldnames=HEADER_CATEGORY_REPORT,
316-
dialect="unix")
326+
writer = csv.DictWriter(
327+
fh, fieldnames=HEADER_CATEGORY_REPORT, dialect="unix"
328+
)
317329
writer.writeheader()
318330
for lic, cats in category_counts.items():
319331
total_for_license = sum(cats.values()) or 1
320332
for code, c in cats.items():
321333
label = CATEGORY_LABELS.get(
322334
code,
323-
code.split(".")[0].upper() if code and "." in code else code
335+
(
336+
code.split(".")[0].upper()
337+
if code and "." in code
338+
else code
339+
),
324340
)
325341
pct = round((c / total_for_license) * 100, 2)
326-
writer.writerow({
327-
"TOOL_IDENTIFIER": lic,
328-
"CATEGORY_CODE": code,
329-
"CATEGORY_LABEL": label,
330-
"COUNT": c,
331-
"PERCENT": pct,
332-
})
342+
writer.writerow(
343+
{
344+
"TOOL_IDENTIFIER": lic,
345+
"CATEGORY_CODE": code,
346+
"CATEGORY_LABEL": label,
347+
"COUNT": c,
348+
"PERCENT": pct,
349+
}
350+
)
333351

334352
# Save aggregated category report (top N per license, rest -> Other)
335353
TOP_N = 10
336354
with open(FILE_ARXIV_CATEGORY_REPORT_AGGREGATE, "w", newline="") as fh:
337355
writer = csv.DictWriter(
338356
fh,
339-
fieldnames=["TOOL_IDENTIFIER", "CATEGORY_LABEL", "COUNT",
340-
"PERCENT"],
341-
dialect="unix"
357+
fieldnames=[
358+
"TOOL_IDENTIFIER",
359+
"CATEGORY_LABEL",
360+
"COUNT",
361+
"PERCENT",
362+
],
363+
dialect="unix",
342364
)
343365
writer.writeheader()
344366
for lic, cats in category_counts.items():
345367
total_for_license = sum(cats.values()) or 1
346-
sorted_cats = sorted(cats.items(), key=lambda x: x[1],
347-
reverse=True)
368+
sorted_cats = sorted(
369+
cats.items(), key=lambda x: x[1], reverse=True
370+
)
348371
top = sorted_cats[:TOP_N]
349372
others = sorted_cats[TOP_N:]
350373
other_count = sum(c for _, c in others)
351374
for code, c in top:
352375
label = CATEGORY_LABELS.get(
353376
code,
354-
code.split(".")[0].upper() if code and "." in code else code
377+
(
378+
code.split(".")[0].upper()
379+
if code and "." in code
380+
else code
381+
),
382+
)
383+
writer.writerow(
384+
{
385+
"TOOL_IDENTIFIER": lic,
386+
"CATEGORY_LABEL": label,
387+
"COUNT": c,
388+
"PERCENT": round((c / total_for_license) * 100, 2),
389+
}
355390
)
356-
writer.writerow({
357-
"TOOL_IDENTIFIER": lic,
358-
"CATEGORY_LABEL": label,
359-
"COUNT": c,
360-
"PERCENT": round((c / total_for_license) * 100, 2),
361-
})
362391
if other_count:
363-
writer.writerow({
364-
"TOOL_IDENTIFIER": lic,
365-
"CATEGORY_LABEL": "Other",
366-
"COUNT": other_count,
367-
"PERCENT": round((other_count / total_for_license) * 100, 2),
368-
})
392+
writer.writerow(
393+
{
394+
"TOOL_IDENTIFIER": lic,
395+
"CATEGORY_LABEL": "Other",
396+
"COUNT": other_count,
397+
"PERCENT": round(
398+
(other_count / total_for_license) * 100, 2
399+
),
400+
}
401+
)
369402

370403
# Save year counts
371404
with open(FILE_ARXIV_YEAR, "w", newline="") as fh:
372405
writer = csv.DictWriter(fh, fieldnames=HEADER_YEAR, dialect="unix")
373406
writer.writeheader()
374407
for lic, years in year_counts.items():
375408
for year, c in years.items():
376-
writer.writerow({
377-
"TOOL_IDENTIFIER": lic,
378-
"YEAR": year,
379-
"COUNT": c
380-
})
409+
writer.writerow(
410+
{"TOOL_IDENTIFIER": lic, "YEAR": year, "COUNT": c}
411+
)
381412

382413
# Save detailed author counts (AUTHOR_COUNT as integer or Unknown)
383414
with open(FILE_ARXIV_AUTHOR, "w", newline="") as fh:
384-
writer = csv.DictWriter(fh, fieldnames=HEADER_AUTHOR,
385-
dialect="unix")
415+
writer = csv.DictWriter(fh, fieldnames=HEADER_AUTHOR, dialect="unix")
386416
writer.writeheader()
387417
for lic, acs in author_counts.items():
388418
for ac, c in acs.items():
389-
writer.writerow({
390-
"TOOL_IDENTIFIER": lic,
391-
"AUTHOR_COUNT": ac if ac is not None else "Unknown",
392-
"COUNT": c
393-
})
419+
writer.writerow(
420+
{
421+
"TOOL_IDENTIFIER": lic,
422+
"AUTHOR_COUNT": ac if ac is not None else "Unknown",
423+
"COUNT": c,
424+
}
425+
)
394426

395427
# Save author buckets summary
396428
with open(FILE_ARXIV_AUTHOR_BUCKET, "w", newline="") as fh:
397-
writer = csv.DictWriter(fh, fieldnames=HEADER_AUTHOR_BUCKET,
398-
dialect="unix")
429+
writer = csv.DictWriter(
430+
fh, fieldnames=HEADER_AUTHOR_BUCKET, dialect="unix"
431+
)
399432
writer.writeheader()
400433
# build buckets across licenses
401434
for lic, acs in author_counts.items():
@@ -404,11 +437,9 @@ def save_count_data(license_counts, category_counts, year_counts,
404437
b = bucket_author_count(ac)
405438
bucket_counts[b] += c
406439
for b, c in bucket_counts.items():
407-
writer.writerow({
408-
"TOOL_IDENTIFIER": lic,
409-
"AUTHOR_BUCKET": b,
410-
"COUNT": c
411-
})
440+
writer.writerow(
441+
{"TOOL_IDENTIFIER": lic, "AUTHOR_BUCKET": b, "COUNT": c}
442+
)
412443

413444

414445
def query_arxiv(args):
@@ -523,7 +554,8 @@ def query_arxiv(args):
523554
if consecutive_empty_calls >= 2:
524555
LOGGER.info(
525556
f"No new papers in 2 consecutive calls for "
526-
f"query: {search_query}. Moving over to the next query."
557+
f"query: {search_query}. "
558+
f"Moving over to the next query."
527559
)
528560
break
529561
else:
@@ -535,11 +567,13 @@ def query_arxiv(args):
535567
save_count_data(
536568
license_counts, category_counts, year_counts, author_counts
537569
)
538-
570+
539571
# Convert category codes to user-friendly names
540572
try:
541573
input_file = FILE_ARXIV_CATEGORY
542-
output_file = shared.path_join(PATHS["data_1-fetch"], "arxiv_2_count_by_category_report.csv")
574+
output_file = shared.path_join(
575+
PATHS["data_1-fetch"], "arxiv_2_count_by_category_report.csv"
576+
)
543577
arxiv_category_converter.convert_categories_to_friendly_names(
544578
input_file, output_file, PATHS["data"]
545579
)

0 commit comments

Comments
 (0)