6767 "CATEGORY_CODE" ,
6868 "CATEGORY_LABEL" ,
6969 "COUNT" ,
70- "PERCENT" ,
7170]
7271HEADER_YEAR = ["TOOL_IDENTIFIER" , "YEAR" , "COUNT" ]
7372HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER" , "AUTHOR_BUCKET" , "COUNT" ]
258257FILE_ARXIV_CATEGORY_REPORT = shared .path_join (
259258 PATHS ["data_1-fetch" ], "arxiv_2_count_by_category_report.csv"
260259)
261- FILE_ARXIV_CATEGORY_REPORT_AGGREGATE = shared .path_join (
262- PATHS ["data_1-fetch" ], "arxiv_2_count_by_category_report_agg.csv"
263- )
264260FILE_ARXIV_YEAR = shared .path_join (
265261 PATHS ["data_1-fetch" ], "arxiv_3_count_by_year.csv"
266262)
@@ -456,7 +452,7 @@ def save_count_data(
456452 for lic , c in license_counts .items ():
457453 writer .writerow ({"TOOL_IDENTIFIER" : lic , "COUNT" : c })
458454
459- # Save category report with labels and percent
455+ # Save category report with labels
460456 with open (
461457 FILE_ARXIV_CATEGORY_REPORT , "w" , newline = "" , encoding = "utf-8"
462458 ) as fh :
@@ -465,65 +461,14 @@ def save_count_data(
465461 )
466462 writer .writeheader ()
467463 for lic , cats in category_counts .items ():
468- total_for_license = sum (cats .values ()) or 1
469464 for code , c in cats .items ():
470465 label = CATEGORIES .get (code , code )
471- pct = round ((c / total_for_license ) * 100 , 2 )
472466 writer .writerow (
473467 {
474468 "TOOL_IDENTIFIER" : lic ,
475469 "CATEGORY_CODE" : code ,
476470 "CATEGORY_LABEL" : label ,
477471 "COUNT" : c ,
478- "PERCENT" : pct ,
479- }
480- )
481-
482- # Save aggregated category report (top N per license, rest -> Other)
483- with open (
484- FILE_ARXIV_CATEGORY_REPORT_AGGREGATE , "w" , newline = "" , encoding = "utf-8"
485- ) as fh :
486- writer = csv .DictWriter (
487- fh ,
488- fieldnames = [
489- "TOOL_IDENTIFIER" ,
490- "CATEGORY_CODE" ,
491- "CATEGORY_LABEL" ,
492- "COUNT" ,
493- "PERCENT" ,
494- ],
495- dialect = "unix" ,
496- )
497- writer .writeheader ()
498- for lic , cats in category_counts .items ():
499- total_for_license = sum (cats .values ()) or 1
500- sorted_cats = sorted (
501- cats .items (), key = lambda x : x [1 ], reverse = True
502- )
503- top = sorted_cats [:10 ]
504- others = sorted_cats [10 :]
505- other_count = sum (c for _ , c in others )
506- for code , c in top :
507- label = CATEGORIES .get (code , code )
508- writer .writerow (
509- {
510- "TOOL_IDENTIFIER" : lic ,
511- "CATEGORY_CODE" : code ,
512- "CATEGORY_LABEL" : label ,
513- "COUNT" : c ,
514- "PERCENT" : round ((c / total_for_license ) * 100 , 2 ),
515- }
516- )
517- if other_count :
518- writer .writerow (
519- {
520- "TOOL_IDENTIFIER" : lic ,
521- "CATEGORY_CODE" : "OTHER" ,
522- "CATEGORY_LABEL" : "Other" ,
523- "COUNT" : other_count ,
524- "PERCENT" : round (
525- (other_count / total_for_license ) * 100 , 2
526- ),
527472 }
528473 )
529474
0 commit comments