Skip to content

automate data/cleaned/math_arxiv_snapshot.csv generation #2

@dopper

Description

@dopper

Hi Brian,
Consider adding the following helper script to automate the generation of the data file:

scripts/parse_arxiv_to_csv.py
#!/usr/bin/env python3
"""
Parse arXiv JSON snapshot and extract mathematics papers to CSV format.
"""

import json
import csv
import sys
from pathlib import Path
import argparse

def is_math_paper(categories):
    """Return True when the space-separated category string contains a math category.

    A category counts as mathematics if it is exactly 'math' or carries the
    'math.' subject-class prefix (e.g. 'math.AG', 'math.NT').
    """
    if not categories:
        return False

    # arXiv stores categories as a single space-delimited string.
    for cat in categories.split():
        if cat == 'math' or cat.startswith('math.'):
            return True
    return False

def parse_arxiv_json(input_file, output_file):
    """Parse a line-delimited arXiv JSON snapshot and write math papers to CSV.

    Parameters
    ----------
    input_file : str or Path
        Path to the newline-delimited JSON metadata snapshot (one JSON
        object per line).
    output_file : str or Path
        Destination CSV path; overwritten if it already exists.
    """
    # Columns required downstream (as specified in the README).
    fieldnames = ['id', 'authors', 'title', 'categories', 'abstract', 'update_date', 'authors_parsed']

    # Running statistics for the final report.
    total_papers = 0
    math_papers = 0

    print(f"Processing {input_file}...")

    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:

        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        # The snapshot is one JSON object per line.
        for line_num, line in enumerate(infile, 1):
            if line_num % 10000 == 0:
                print(f"  Processed {line_num:,} lines, found {math_papers:,} math papers...")

            line = line.strip()
            if not line:
                # Skip blank lines instead of logging a spurious
                # JSONDecodeError warning for them.
                continue

            try:
                paper = json.loads(line)
                total_papers += 1

                categories = paper.get('categories', '')
                if is_math_paper(categories):
                    math_papers += 1

                    # Newlines inside title/abstract would break CSV rows,
                    # so collapse them to spaces.
                    row = {
                        'id': paper.get('id', ''),
                        'authors': paper.get('authors', ''),
                        'title': paper.get('title', '').replace('\n', ' ').strip(),
                        'categories': categories,
                        'abstract': paper.get('abstract', '').replace('\n', ' ').strip(),
                        'update_date': paper.get('update_date', ''),
                        # authors_parsed is a nested list; serialize it so it
                        # survives the round trip through CSV.
                        'authors_parsed': json.dumps(paper.get('authors_parsed', []))
                    }

                    writer.writerow(row)

            except json.JSONDecodeError as e:
                print(f"  Warning: Failed to parse JSON on line {line_num}: {e}")
            except Exception as e:
                # Best-effort: report the bad record and keep going rather
                # than abort a multi-gigabyte run.
                print(f"  Warning: Error processing line {line_num}: {e}")

    print("\nProcessing complete!")
    print(f"Total papers processed: {total_papers:,}")
    print(f"Mathematics papers found: {math_papers:,}")
    print(f"Output saved to: {output_file}")

def main():
    """Command-line entry point: parse arguments, prepare the output directory, run the parser."""
    arg_parser = argparse.ArgumentParser(
        description='Parse arXiv JSON snapshot and extract mathematics papers to CSV.'
    )
    arg_parser.add_argument(
        'input_file',
        help='Path to the input JSON file (e.g., arxiv-metadata-oai-snapshot.json)'
    )
    arg_parser.add_argument(
        '-o', '--output',
        default='data/cleaned/math_arxiv_snapshot.csv',
        help='Output CSV file path (default: data/cleaned/math_arxiv_snapshot.csv)'
    )
    args = arg_parser.parse_args()

    # Make sure the destination directory exists before writing.
    Path(args.output).parent.mkdir(parents=True, exist_ok=True)

    # Hand off to the actual extraction routine.
    parse_arxiv_json(args.input_file, args.output)

# Standard script-entry guard. The pasted version read `if name == 'main':`
# (markdown rendering stripped the dunder underscores), which would raise
# NameError at import time; restore __name__ / '__main__'.
if __name__ == '__main__':
    main()

README.md update
Add a run step — `python scripts/parse_arxiv_to_csv.py data/raw/arxiv-metadata-oai-snapshot.json` — after the file-download instructions, to generate the data file.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions