Skip to content

Commit a544cab

Browse files
committed
Add category converter in /dev called by arxiv_fetch.py to generate user-friendly names
1 parent 70d191f commit a544cab

2 files changed

Lines changed: 75 additions & 0 deletions

File tree

dev/arxiv_category_converter.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#!/usr/bin/env python
2+
"""
3+
ArXiv category code to user-friendly name converter.
4+
Called by arxiv_fetch.py to convert category codes to readable names.
5+
"""
6+
import csv
7+
import os
8+
import yaml
9+
10+
def load_category_mapping(data_dir):
    """
    Load category code to label mapping from YAML file.

    Args:
        data_dir: Directory expected to contain arxiv_category_map.yaml

    Returns:
        dict mapping category codes to labels. An empty dict is returned when
        the file is missing, cannot be read or decoded, is not valid YAML, or
        its top-level value is not a mapping.
    """
    mapping_file = os.path.join(data_dir, "arxiv_category_map.yaml")

    try:
        # Explicit encoding so the result does not depend on the platform
        # locale (labels may contain non-ASCII characters).
        with open(mapping_file, "r", encoding="utf-8") as f:
            mapping = yaml.safe_load(f)
    except FileNotFoundError:
        # A missing mapping file is an expected situation, not an error.
        return {}
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        # Best-effort loader: an unreadable or malformed mapping file must
        # not abort the caller, which can still fall back to raw codes.
        return {}

    # safe_load() yields None for an empty document and a list/scalar for
    # YAML whose root is not a mapping; callers rely on dict.get(), so
    # anything that is not a dict is treated as "no mapping available".
    return mapping if isinstance(mapping, dict) else {}
22+
23+
def convert_categories_to_friendly_names(input_file, output_file, data_dir):
    """
    Convert category codes in CSV to user-friendly names.

    Every row of input_file is copied to output_file (written with the csv
    'unix' dialect). When the input has a CATEGORY column, a CATEGORY_LABEL
    column is inserted immediately after it. The label is looked up in the
    mapping loaded from data_dir; a code that is not in the mapping falls
    back to the upper-cased part before the first '.', or to the code itself
    when it contains no '.'.

    Nothing is written when input_file does not exist. An input file with no
    header row produces an empty output file.

    Args:
        input_file: Path to input CSV with category codes
        output_file: Path to output CSV with friendly names
        data_dir: Directory containing arxiv_category_map.yaml
    """
    if not os.path.exists(input_file):
        return

    # newline="" is required by the csv module so that newlines embedded in
    # quoted fields are handled correctly; the encoding is explicit so the
    # result does not depend on the platform locale.
    with open(input_file, "r", newline="", encoding="utf-8") as infile, open(
        output_file, "w", newline="", encoding="utf-8"
    ) as outfile:
        reader = csv.DictReader(infile)

        # fieldnames is None when the input has no header row (empty file);
        # there is nothing to convert, so leave the output empty.
        source_fields = reader.fieldnames
        if source_fields is None:
            return

        # Create new fieldnames with both code and label
        fieldnames = []
        for field in source_fields:
            fieldnames.append(field)
            if field == "CATEGORY":
                fieldnames.append("CATEGORY_LABEL")

        # The column check is the same for every row, so do it once; the
        # mapping is only needed when there is a CATEGORY column to label.
        has_category = "CATEGORY" in source_fields
        category_mapping = (
            load_category_mapping(data_dir) if has_category else {}
        )

        writer = csv.DictWriter(outfile, fieldnames=fieldnames, dialect="unix")
        writer.writeheader()

        for row in reader:
            if has_category:
                category_code = row["CATEGORY"]
                # Fallback label: upper-cased first part of a dotted code.
                # Empty or missing codes (short rows yield None) and codes
                # without a '.' are passed through unchanged.
                if category_code and "." in category_code:
                    fallback = category_code.split(".")[0].upper()
                else:
                    fallback = category_code
                row["CATEGORY_LABEL"] = category_mapping.get(
                    category_code, fallback
                )

            writer.writerow(row)

scripts/1-fetch/arxiv_fetch.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,12 @@
2828

2929
# Add parent directory so shared can be imported
3030
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
31+
# Add dev directory for category converter
32+
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "dev"))
3133

3234
# First-party/Local
3335
import shared # noqa: E402
36+
import arxiv_category_converter # noqa: E402
3437

3538
# Setup
3639
LOGGER, PATHS = shared.setup(__file__)
@@ -532,6 +535,17 @@ def query_arxiv(args):
532535
save_count_data(
533536
license_counts, category_counts, year_counts, author_counts
534537
)
538+
539+
# Convert category codes to user-friendly names
540+
try:
541+
input_file = FILE_ARXIV_CATEGORY
542+
output_file = shared.path_join(PATHS["data"], "arxiv_2_count_by_category_converted.csv")
543+
arxiv_category_converter.convert_categories_to_friendly_names(
544+
input_file, output_file, PATHS["data"]
545+
)
546+
LOGGER.info(f"Category conversion completed: {output_file}")
547+
except Exception as e:
548+
LOGGER.warning(f"Category conversion failed: {e}")
535549

536550
# save provenance
537551
provenance_data = {

0 commit comments

Comments
 (0)