Skip to content

Commit 0aa919c

Browse files
committed
Update arxiv_fetch.py
1 parent 267105b commit 0aa919c

1 file changed

Lines changed: 21 additions & 11 deletions

File tree

scripts/1-fetch/arxiv_fetch.py

Lines changed: 21 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -191,7 +191,7 @@ def parse_arguments():
191191
"--limit",
192192
type=int,
193193
default=DEFAULT_FETCH_LIMIT,
194-
help=f"Limit number of papers to fetch (default: {DEFAULT_FETCH_LIMIT})",
194+
help=f"Limit papers to fetch (default: {DEFAULT_FETCH_LIMIT})",
195195
)
196196
parser.add_argument(
197197
"--enable-save",
@@ -212,7 +212,7 @@ def parse_arguments():
212212
def initialize_data_file(file_path, headers):
213213
"""Initialize CSV file with headers if it doesn't exist."""
214214
if not os.path.isfile(file_path):
215-
with open(file_path, "w", newline="") as file_obj:
215+
with open(file_path, "w", newline="", encoding="utf-8") as file_obj:
216216
writer = csv.DictWriter(
217217
file_obj, fieldnames=headers, dialect="unix"
218218
)
@@ -295,7 +295,9 @@ def extract_year_from_entry(entry):
295295
try:
296296
return entry.published[:4] # Extract year from date string
297297
except (AttributeError, IndexError) as e:
298-
LOGGER.debug(f"Failed to extract year from entry.published '{entry.published}': {e}")
298+
LOGGER.debug(
299+
f"Failed to extract year from '{entry.published}': {e}"
300+
)
299301
return "Unknown"
300302

301303

@@ -334,14 +336,14 @@ def save_count_data(
334336
# author_counts: {license: {author_count(int|None): count}}
335337

336338
# Save license counts
337-
with open(FILE_ARXIV_COUNT, "w", newline="") as fh:
339+
with open(FILE_ARXIV_COUNT, "w", newline="", encoding="utf-8") as fh:
338340
writer = csv.DictWriter(fh, fieldnames=HEADER_COUNT, dialect="unix")
339341
writer.writeheader()
340342
for lic, c in license_counts.items():
341343
writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": c})
342344

343345
# Save detailed category counts (code)
344-
with open(FILE_ARXIV_CATEGORY, "w", newline="") as fh:
346+
with open(FILE_ARXIV_CATEGORY, "w", newline="", encoding="utf-8") as fh:
345347
writer = csv.DictWriter(fh, fieldnames=HEADER_CATEGORY, dialect="unix")
346348
writer.writeheader()
347349
for lic, cats in category_counts.items():
@@ -351,7 +353,9 @@ def save_count_data(
351353
)
352354

353355
# Save category report with labels and percent
354-
with open(FILE_ARXIV_CATEGORY_REPORT, "w", newline="") as fh:
356+
with open(
357+
FILE_ARXIV_CATEGORY_REPORT, "w", newline="", encoding="utf-8"
358+
) as fh:
355359
writer = csv.DictWriter(
356360
fh, fieldnames=HEADER_CATEGORY_REPORT, dialect="unix"
357361
)
@@ -380,7 +384,9 @@ def save_count_data(
380384

381385
# Save aggregated category report (top N per license, rest -> Other)
382386
TOP_N = 10
383-
with open(FILE_ARXIV_CATEGORY_REPORT_AGGREGATE, "w", newline="") as fh:
387+
with open(
388+
FILE_ARXIV_CATEGORY_REPORT_AGGREGATE, "w", newline="", encoding="utf-8"
389+
) as fh:
384390
writer = csv.DictWriter(
385391
fh,
386392
fieldnames=[
@@ -430,7 +436,7 @@ def save_count_data(
430436
)
431437

432438
# Save year counts
433-
with open(FILE_ARXIV_YEAR, "w", newline="") as fh:
439+
with open(FILE_ARXIV_YEAR, "w", newline="", encoding="utf-8") as fh:
434440
writer = csv.DictWriter(fh, fieldnames=HEADER_YEAR, dialect="unix")
435441
writer.writeheader()
436442
for lic, years in year_counts.items():
@@ -440,7 +446,7 @@ def save_count_data(
440446
)
441447

442448
# Save detailed author counts (AUTHOR_COUNT as integer or Unknown)
443-
with open(FILE_ARXIV_AUTHOR, "w", newline="") as fh:
449+
with open(FILE_ARXIV_AUTHOR, "w", newline="", encoding="utf-8") as fh:
444450
writer = csv.DictWriter(fh, fieldnames=HEADER_AUTHOR, dialect="unix")
445451
writer.writeheader()
446452
for lic, acs in author_counts.items():
@@ -454,7 +460,9 @@ def save_count_data(
454460
)
455461

456462
# Save author buckets summary
457-
with open(FILE_ARXIV_AUTHOR_BUCKET, "w", newline="") as fh:
463+
with open(
464+
FILE_ARXIV_AUTHOR_BUCKET, "w", newline="", encoding="utf-8"
465+
) as fh:
458466
writer = csv.DictWriter(
459467
fh, fieldnames=HEADER_AUTHOR_BUCKET, dialect="unix"
460468
)
@@ -503,7 +511,9 @@ def query_arxiv(args):
503511
consecutive_empty_calls = 0
504512

505513
for start in range(
506-
0, min(args.limit - total_fetched, MAX_RESULTS_PER_QUERY), results_per_iteration
514+
0,
515+
min(args.limit - total_fetched, MAX_RESULTS_PER_QUERY),
516+
results_per_iteration,
507517
):
508518
encoded_query = urllib.parse.quote_plus(search_query)
509519
query = (

0 commit comments

Comments
 (0)