|
3 | 3 | ArXiv category code to user-friendly name converter. |
4 | 4 | Called by arxiv_fetch.py to convert category codes to readable names. |
5 | 5 | """ |
| 6 | +# Standard library |
6 | 7 | import csv |
7 | 8 | import os |
| 9 | + |
| 10 | +# Third-party |
8 | 11 | import yaml |
9 | 12 |
|
| 13 | + |
def load_category_mapping(data_dir):
    """Load category code to label mapping from YAML file.

    Reads ``arxiv_category_map.yaml`` from ``data_dir``. Loading is
    best-effort: any problem with the file yields an empty mapping instead
    of an exception.

    Args:
        data_dir: Directory containing arxiv_category_map.yaml

    Returns:
        The parsed mapping as a dict, or an empty dict when the file is
        missing, unreadable, not valid YAML, or its top-level value is not
        a mapping.
    """
    mapping_file = os.path.join(data_dir, "arxiv_category_map.yaml")

    if not os.path.exists(mapping_file):
        return {}

    try:
        # Explicit encoding so parsing does not depend on the platform default.
        with open(mapping_file, "r", encoding="utf-8") as f:
            mapping = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError):
        # File vanished, is a directory, is unreadable, or is not UTF-8.
        return {}
    except yaml.YAMLError:
        # Malformed YAML.
        return {}

    # An empty file parses to None, and a file may hold a list or scalar at
    # the top level; only a dict supports the .get() lookups done by the
    # converter in this module.
    return mapping if isinstance(mapping, dict) else {}
22 | 26 |
|
| 27 | + |
def convert_categories_to_friendly_names(input_file, output_file, data_dir):
    """
    Convert category codes in CSV to user-friendly names.

    Every input row is copied to the output. When the input has a
    ``CATEGORY`` column, a ``CATEGORY_LABEL`` column is inserted directly
    after it. The label comes from the category mapping; for a code that is
    not in the mapping, the part before the first "." is upper-cased
    (e.g. "cs.AI" -> "CS"), and a code without a "." is used unchanged.

    Does nothing when ``input_file`` does not exist. When ``input_file`` is
    completely empty (no header row), ``output_file`` is created empty.

    Args:
        input_file: Path to input CSV with category codes
        output_file: Path to output CSV with friendly names
        data_dir: Directory containing arxiv_category_map.yaml
    """
    if not os.path.exists(input_file):
        return

    # Load category mapping
    category_mapping = load_category_mapping(data_dir)

    # newline="" on both files lets the csv module do its own newline
    # handling, so line breaks inside quoted fields survive the round trip.
    with (
        open(input_file, "r", newline="") as infile,
        open(output_file, "w", newline="") as outfile,
    ):
        reader = csv.DictReader(infile)

        # An empty input has no header row: fieldnames is None and there is
        # nothing to convert, so leave the (already created) output empty.
        if reader.fieldnames is None:
            return

        # Create new fieldnames with both code and label
        fieldnames = []
        for field in reader.fieldnames:
            fieldnames.append(field)
            if field == "CATEGORY":
                fieldnames.append("CATEGORY_LABEL")

        writer = csv.DictWriter(outfile, fieldnames=fieldnames, dialect="unix")
        writer.writeheader()

        for row in reader:
            if "CATEGORY" in row:
                category_code = row["CATEGORY"]
                # Fallback for unmapped codes: upper-cased first part before
                # the dot; empty/None or dot-less codes are passed through.
                if category_code and "." in category_code:
                    fallback_label = category_code.split(".")[0].upper()
                else:
                    fallback_label = category_code
                row["CATEGORY_LABEL"] = category_mapping.get(
                    category_code, fallback_label
                )

            writer.writerow(row)
0 commit comments