44"""
55# Standard library
66import csv
7+ import logging
78import os
9+ import sys
810
911# Third-party
1012import yaml
1113
14+ # Add scripts directory to path to import shared module
15+ sys .path .append (os .path .join (os .path .dirname (__file__ ), '..' , 'scripts' ))
16+
17+ logger = logging .getLogger (__name__ )
18+
1219
def load_category_mapping(data_dir):
    """Load the category code to label mapping from a YAML file.

    Reads ``arxiv_category_map.yaml`` from ``data_dir``. This function never
    raises for the failure modes it handles; it logs them and falls back to
    an empty mapping so callers can continue with their own defaults.

    Args:
        data_dir: Directory expected to contain ``arxiv_category_map.yaml``.

    Returns:
        dict: The parsed mapping, or an empty dict when the file is missing,
        empty, unreadable, badly encoded, not valid YAML, or when its
        top-level value is not a mapping.
    """
    mapping_file = os.path.join(data_dir, "arxiv_category_map.yaml")

    if not os.path.exists(mapping_file):
        logger.warning(f"Category mapping file not found: {mapping_file}")
        return {}

    try:
        # Explicit encoding: without it the platform locale decides how the
        # file is decoded, which can corrupt or reject non-ASCII labels.
        with open(mapping_file, "r", encoding="utf-8") as f:
            # An empty YAML document parses to None; normalise it to {}.
            mapping = yaml.safe_load(f) or {}
    except (yaml.YAMLError, IOError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an IOError, so it has to be
        # listed explicitly to keep the "log and return {}" contract.
        logger.error(f"Failed to load category mapping from {mapping_file}: {e}")
        return {}

    # Valid YAML may still be a list or a scalar; callers rely on dict
    # methods such as .get(), so reject anything that is not a mapping.
    if not isinstance(mapping, dict):
        logger.error(
            f"Category mapping in {mapping_file} is not a mapping "
            f"(got {type(mapping).__name__})"
        )
        return {}

    return mapping
2534
2635
@@ -34,39 +43,47 @@ def convert_categories_to_friendly_names(input_file, output_file, data_dir):
3443 data_dir: Directory containing arxiv_category_map.yaml
3544 """
3645 if not os .path .exists (input_file ):
46+ logger .error (f"Input file not found: { input_file } " )
3747 return
3848
3949 # Load category mapping
4050 category_mapping = load_category_mapping (data_dir )
51+ logger .info (f"Loaded { len (category_mapping )} category mappings" )
52+
53+ try :
54+ with (
55+ open (input_file , "r" ) as infile ,
56+ open (output_file , "w" , newline = "" ) as outfile ,
57+ ):
58+ reader = csv .DictReader (infile )
59+
60+ # Create new fieldnames with both code and label
61+ fieldnames = []
62+ for field in reader .fieldnames :
63+ fieldnames .append (field )
64+ if field == "CATEGORY" :
65+ fieldnames .append ("CATEGORY_LABEL" )
66+
67+ writer = csv .DictWriter (outfile , fieldnames = fieldnames , dialect = "unix" )
68+ writer .writeheader ()
69+
70+ for row in reader :
71+ if "CATEGORY" in row :
72+ category_code = row ["CATEGORY" ]
73+ # Convert code to label, fallback to uppercase first part
74+ category_label = category_mapping .get (
75+ category_code ,
76+ (
77+ category_code .split ("." )[0 ].upper ()
78+ if category_code and "." in category_code
79+ else category_code
80+ ),
81+ )
82+ row ["CATEGORY_LABEL" ] = category_label
4183
42- with (
43- open (input_file , "r" ) as infile ,
44- open (output_file , "w" , newline = "" ) as outfile ,
45- ):
46- reader = csv .DictReader (infile )
47-
48- # Create new fieldnames with both code and label
49- fieldnames = []
50- for field in reader .fieldnames :
51- fieldnames .append (field )
52- if field == "CATEGORY" :
53- fieldnames .append ("CATEGORY_LABEL" )
54-
55- writer = csv .DictWriter (outfile , fieldnames = fieldnames , dialect = "unix" )
56- writer .writeheader ()
57-
58- for row in reader :
59- if "CATEGORY" in row :
60- category_code = row ["CATEGORY" ]
61- # Convert code to label, fallback to uppercase first part
62- category_label = category_mapping .get (
63- category_code ,
64- (
65- category_code .split ("." )[0 ].upper ()
66- if category_code and "." in category_code
67- else category_code
68- ),
69- )
70- row ["CATEGORY_LABEL" ] = category_label
71-
72- writer .writerow (row )
84+ writer .writerow (row )
85+
86+ logger .info (f"Successfully converted categories: { input_file } -> { output_file } " )
87+
88+ except (IOError , csv .Error ) as e :
89+ logger .error (f"Failed to process CSV files: { e } " )
0 commit comments