Skip to content

Commit 0414859

Browse files
committed
Remove PERCENT column and aggregated category report generation
1 parent df6fe6b commit 0414859

1 file changed

Lines changed: 1 addition & 56 deletions

File tree

scripts/1-fetch/arxiv_fetch.py

Lines changed: 1 addition & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -67,7 +67,6 @@
6767
"CATEGORY_CODE",
6868
"CATEGORY_LABEL",
6969
"COUNT",
70-
"PERCENT",
7170
]
7271
HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"]
7372
HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER", "AUTHOR_BUCKET", "COUNT"]
@@ -258,9 +257,6 @@
258257
FILE_ARXIV_CATEGORY_REPORT = shared.path_join(
259258
PATHS["data_1-fetch"], "arxiv_2_count_by_category_report.csv"
260259
)
261-
FILE_ARXIV_CATEGORY_REPORT_AGGREGATE = shared.path_join(
262-
PATHS["data_1-fetch"], "arxiv_2_count_by_category_report_agg.csv"
263-
)
264260
FILE_ARXIV_YEAR = shared.path_join(
265261
PATHS["data_1-fetch"], "arxiv_3_count_by_year.csv"
266262
)
@@ -456,7 +452,7 @@ def save_count_data(
456452
for lic, c in license_counts.items():
457453
writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": c})
458454

459-
# Save category report with labels and percent
455+
# Save category report with labels
460456
with open(
461457
FILE_ARXIV_CATEGORY_REPORT, "w", newline="", encoding="utf-8"
462458
) as fh:
@@ -465,65 +461,14 @@ def save_count_data(
465461
)
466462
writer.writeheader()
467463
for lic, cats in category_counts.items():
468-
total_for_license = sum(cats.values()) or 1
469464
for code, c in cats.items():
470465
label = CATEGORIES.get(code, code)
471-
pct = round((c / total_for_license) * 100, 2)
472466
writer.writerow(
473467
{
474468
"TOOL_IDENTIFIER": lic,
475469
"CATEGORY_CODE": code,
476470
"CATEGORY_LABEL": label,
477471
"COUNT": c,
478-
"PERCENT": pct,
479-
}
480-
)
481-
482-
# Save aggregated category report (top N per license, rest -> Other)
483-
with open(
484-
FILE_ARXIV_CATEGORY_REPORT_AGGREGATE, "w", newline="", encoding="utf-8"
485-
) as fh:
486-
writer = csv.DictWriter(
487-
fh,
488-
fieldnames=[
489-
"TOOL_IDENTIFIER",
490-
"CATEGORY_CODE",
491-
"CATEGORY_LABEL",
492-
"COUNT",
493-
"PERCENT",
494-
],
495-
dialect="unix",
496-
)
497-
writer.writeheader()
498-
for lic, cats in category_counts.items():
499-
total_for_license = sum(cats.values()) or 1
500-
sorted_cats = sorted(
501-
cats.items(), key=lambda x: x[1], reverse=True
502-
)
503-
top = sorted_cats[:10]
504-
others = sorted_cats[10:]
505-
other_count = sum(c for _, c in others)
506-
for code, c in top:
507-
label = CATEGORIES.get(code, code)
508-
writer.writerow(
509-
{
510-
"TOOL_IDENTIFIER": lic,
511-
"CATEGORY_CODE": code,
512-
"CATEGORY_LABEL": label,
513-
"COUNT": c,
514-
"PERCENT": round((c / total_for_license) * 100, 2),
515-
}
516-
)
517-
if other_count:
518-
writer.writerow(
519-
{
520-
"TOOL_IDENTIFIER": lic,
521-
"CATEGORY_CODE": "OTHER",
522-
"CATEGORY_LABEL": "Other",
523-
"COUNT": other_count,
524-
"PERCENT": round(
525-
(other_count / total_for_license) * 100, 2
526-
),
527472
}
528473
)
529474

0 commit comments

Comments
 (0)