Skip to content

automate data/cleaned/math_arxiv_snapshot.csv generation #2

@dopper

Description

@dopper

Hi Brian,
Consider adding the following helper script to automate the generation of the data file:

scripts/parse_arxiv_to_csv.py
#!/usr/bin/env python3
"""
Parse arXiv JSON snapshot and extract mathematics papers to CSV format.
"""

import json
import csv
import sys
from pathlib import Path
import argparse

def is_math_paper(categories):
    """Return True when the space-separated category string contains a math category.

    A category counts as mathematics if it is exactly 'math' or carries the
    'math.' subject-class prefix (e.g. 'math.AG', 'math.NT').
    """
    if not categories:
        return False

    # arXiv stores categories as a single space-delimited string.
    for cat in categories.split():
        if cat == 'math' or cat.startswith('math.'):
            return True
    return False

def parse_arxiv_json(input_file, output_file):
    """Parse a line-delimited arXiv JSON snapshot and write math papers to CSV.

    Parameters
    ----------
    input_file : str or Path
        Path to the newline-delimited JSON metadata snapshot (one JSON
        object per line).
    output_file : str or Path
        Destination CSV path; overwritten if it already exists.
    """
    # Columns required downstream (as specified in the README).
    fieldnames = ['id', 'authors', 'title', 'categories', 'abstract', 'update_date', 'authors_parsed']

    # Running statistics for the final report.
    total_papers = 0
    math_papers = 0

    print(f"Processing {input_file}...")

    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:

        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        # The snapshot is one JSON object per line.
        for line_num, line in enumerate(infile, 1):
            if line_num % 10000 == 0:
                print(f"  Processed {line_num:,} lines, found {math_papers:,} math papers...")

            line = line.strip()
            if not line:
                # Skip blank lines instead of logging a spurious
                # JSONDecodeError warning for them.
                continue

            try:
                paper = json.loads(line)
                total_papers += 1

                categories = paper.get('categories', '')
                if is_math_paper(categories):
                    math_papers += 1

                    # Newlines inside title/abstract would break CSV rows,
                    # so collapse them to spaces.
                    row = {
                        'id': paper.get('id', ''),
                        'authors': paper.get('authors', ''),
                        'title': paper.get('title', '').replace('\n', ' ').strip(),
                        'categories': categories,
                        'abstract': paper.get('abstract', '').replace('\n', ' ').strip(),
                        'update_date': paper.get('update_date', ''),
                        # authors_parsed is a nested list; serialize it so it
                        # survives the round trip through CSV.
                        'authors_parsed': json.dumps(paper.get('authors_parsed', []))
                    }

                    writer.writerow(row)

            except json.JSONDecodeError as e:
                print(f"  Warning: Failed to parse JSON on line {line_num}: {e}")
            except Exception as e:
                # Best-effort: report the bad record and keep going rather
                # than abort a multi-gigabyte run.
                print(f"  Warning: Error processing line {line_num}: {e}")

    print("\nProcessing complete!")
    print(f"Total papers processed: {total_papers:,}")
    print(f"Mathematics papers found: {math_papers:,}")
    print(f"Output saved to: {output_file}")

def main():
    """Command-line entry point: parse arguments, prepare the output directory, run the parser."""
    arg_parser = argparse.ArgumentParser(
        description='Parse arXiv JSON snapshot and extract mathematics papers to CSV.'
    )
    arg_parser.add_argument(
        'input_file',
        help='Path to the input JSON file (e.g., arxiv-metadata-oai-snapshot.json)'
    )
    arg_parser.add_argument(
        '-o', '--output',
        default='data/cleaned/math_arxiv_snapshot.csv',
        help='Output CSV file path (default: data/cleaned/math_arxiv_snapshot.csv)'
    )
    args = arg_parser.parse_args()

    # Make sure the destination directory exists before writing.
    Path(args.output).parent.mkdir(parents=True, exist_ok=True)

    # Hand off to the actual extraction routine.
    parse_arxiv_json(args.input_file, args.output)

# Standard script-entry guard. The pasted version read `if name == 'main':`
# (markdown rendering stripped the dunder underscores), which would raise
# NameError at import time; restore __name__ / '__main__'.
if __name__ == '__main__':
    main()

README.md update
Add a run step — `python scripts/parse_arxiv_to_csv.py data/raw/arxiv-metadata-oai-snapshot.json` — after the file-download instructions, to generate the data file.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions