Hi Brian,
Consider adding the following helper script to automate the generation of the data file:
scripts/parse_arxiv_to_csv.py
#!/usr/bin/env python3
"""
Parse arXiv JSON snapshot and extract mathematics papers to CSV format.
"""
import json
import csv
import sys
from pathlib import Path
import argparse
def is_math_paper(categories):
    """Return True when the space-separated category string includes a math tag.

    A paper counts as mathematics when it carries either the bare 'math'
    archive tag or any 'math.XX' subject class (e.g. 'math.AG').
    Empty or falsy input yields False.
    """
    if not categories:
        return False
    for tag in categories.split():
        if tag == 'math' or tag.startswith('math.'):
            return True
    return False
def parse_arxiv_json(input_file, output_file):
    """Parse a line-delimited arXiv JSON snapshot and write math papers to CSV.

    Args:
        input_file: Path to the snapshot file (one JSON object per line).
        output_file: Path of the CSV file to create (overwritten if present).

    Returns:
        Tuple ``(total_papers, math_papers)`` — counts of parsed records and
        of math records written (new return value; previously returned None,
        so existing callers that ignore the result are unaffected).
    """
    # Columns to extract (as specified in README).
    fieldnames = ['id', 'authors', 'title', 'categories', 'abstract', 'update_date', 'authors_parsed']
    total_papers = 0
    math_papers = 0
    print(f"Processing {input_file}...")
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        for line_num, line in enumerate(infile, 1):
            if line_num % 10000 == 0:
                print(f"  Processed {line_num:,} lines, found {math_papers:,} math papers...")
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. trailing newline at EOF) instead of
                # letting json.loads('') raise and emit a spurious warning.
                continue
            try:
                paper = json.loads(line)
                total_papers += 1
                categories = paper.get('categories', '')
                if is_math_paper(categories):
                    math_papers += 1
                    writer.writerow({
                        'id': paper.get('id', ''),
                        'authors': paper.get('authors', ''),
                        # Titles/abstracts can contain embedded newlines;
                        # flatten them so each CSV record stays one row.
                        'title': paper.get('title', '').replace('\n', ' ').strip(),
                        'categories': categories,
                        'abstract': paper.get('abstract', '').replace('\n', ' ').strip(),
                        'update_date': paper.get('update_date', ''),
                        'authors_parsed': json.dumps(paper.get('authors_parsed', [])),
                    })
            except json.JSONDecodeError as e:
                print(f"  Warning: Failed to parse JSON on line {line_num}: {e}")
            except Exception as e:
                # Deliberately broad: one malformed record should not abort a
                # multi-gigabyte run; log the line number and keep going.
                print(f"  Warning: Error processing line {line_num}: {e}")
    print(f"\nProcessing complete!")
    print(f"Total papers processed: {total_papers:,}")
    print(f"Mathematics papers found: {math_papers:,}")
    print(f"Output saved to: {output_file}")
    return total_papers, math_papers
def main():
    """CLI entry point: parse command-line arguments and run the extraction."""
    parser = argparse.ArgumentParser(
        description='Parse arXiv JSON snapshot and extract mathematics papers to CSV.'
    )
    parser.add_argument(
        'input_file',
        help='Path to the input JSON file (e.g., arxiv-metadata-oai-snapshot.json)'
    )
    parser.add_argument(
        '-o', '--output',
        default='data/cleaned/math_arxiv_snapshot.csv',
        help='Output CSV file path (default: data/cleaned/math_arxiv_snapshot.csv)'
    )
    args = parser.parse_args()
    # Ensure the output directory exists before the parser tries to write there.
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    parse_arxiv_json(args.input_file, args.output)


# Bug fix: the guard read `if name == 'main':`, which raises NameError at
# import time — the dunder underscores were lost (likely markdown rendering).
if __name__ == '__main__':
    main()
README.md update:
After the file-download step, add a step instructing the reader to run `python scripts/parse_arxiv_to_csv.py data/raw/arxiv-metadata-oai-snapshot.json` to generate the data file.
Hi Brian,
Consider adding the following helper script to automate the generation of the data file:
scripts/parse_arxiv_to_csv.py
#!/usr/bin/env python3
"""
Parse arXiv JSON snapshot and extract mathematics papers to CSV format.
"""
import json
import csv
import sys
from pathlib import Path
import argparse
def is_math_paper(categories):
    """Check if paper belongs to mathematics category.

    Returns True when the space-separated category string contains the bare
    'math' archive tag or any 'math.XX' subject class; False otherwise.
    """
    if not categories:
        return False
    # Restored: this copy was truncated after the empty-input guard, so every
    # non-empty input fell through and returned None instead of a boolean.
    category_list = categories.split()
    return any(cat.startswith('math.') or cat == 'math' for cat in category_list)
def parse_arxiv_json(input_file, output_file):
    """Parse arXiv JSON file and extract math papers to CSV.

    Restored: this copy was truncated to a bare docstring and did nothing.
    Reads one JSON object per line from *input_file*, writes math papers to
    *output_file* as CSV, and prints progress/summary statistics.

    Args:
        input_file: Path to the snapshot file (one JSON object per line).
        output_file: Path of the CSV file to create (overwritten if present).
    """
    # Columns to extract (as specified in README).
    fieldnames = ['id', 'authors', 'title', 'categories', 'abstract', 'update_date', 'authors_parsed']
    total_papers = 0
    math_papers = 0
    print(f"Processing {input_file}...")
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        for line_num, line in enumerate(infile, 1):
            if line_num % 10000 == 0:
                print(f"  Processed {line_num:,} lines, found {math_papers:,} math papers...")
            try:
                paper = json.loads(line.strip())
                total_papers += 1
                categories = paper.get('categories', '')
                if is_math_paper(categories):
                    math_papers += 1
                    writer.writerow({
                        'id': paper.get('id', ''),
                        'authors': paper.get('authors', ''),
                        # Flatten embedded newlines so each record is one CSV row.
                        'title': paper.get('title', '').replace('\n', ' ').strip(),
                        'categories': categories,
                        'abstract': paper.get('abstract', '').replace('\n', ' ').strip(),
                        'update_date': paper.get('update_date', ''),
                        'authors_parsed': json.dumps(paper.get('authors_parsed', [])),
                    })
            except json.JSONDecodeError as e:
                print(f"  Warning: Failed to parse JSON on line {line_num}: {e}")
            except Exception as e:
                # Best-effort: one malformed record should not abort the run.
                print(f"  Warning: Error processing line {line_num}: {e}")
    print(f"\nProcessing complete!")
    print(f"Total papers processed: {total_papers:,}")
    print(f"Mathematics papers found: {math_papers:,}")
    print(f"Output saved to: {output_file}")
def main():
    """CLI entry point: parse command-line arguments and run the extraction."""
    parser = argparse.ArgumentParser(
        description='Parse arXiv JSON snapshot and extract mathematics papers to CSV.'
    )
    parser.add_argument(
        'input_file',
        help='Path to the input JSON file (e.g., arxiv-metadata-oai-snapshot.json)'
    )
    parser.add_argument(
        '-o', '--output',
        default='data/cleaned/math_arxiv_snapshot.csv',
        help='Output CSV file path (default: data/cleaned/math_arxiv_snapshot.csv)'
    )
    # Restored: this copy was truncated after the argument declarations and
    # never parsed the arguments or invoked the parser.
    args = parser.parse_args()
    # Ensure the output directory exists before the parser tries to write there.
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    parse_arxiv_json(args.input_file, args.output)


# Bug fix: the guard read `if name == 'main':`, which raises NameError at
# import time — the dunder underscores were lost (likely markdown rendering).
if __name__ == '__main__':
    main()
README.md update:
After the file-download step, add a step instructing the reader to run `python scripts/parse_arxiv_to_csv.py data/raw/arxiv-metadata-oai-snapshot.json` to generate the data file.