Skip to content

Commit 35d868c

Browse files
committed
use CC Legal Tool metadata CSV for license mapping
1 parent 0d34226 commit 35d868c

1 file changed

Lines changed: 73 additions & 81 deletions

File tree

scripts/1-fetch/arxiv_fetch.py

Lines changed: 73 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python
22
"""
3-
Fetch ArXiv papers with CC license information using OAI-PMH API.
3+
Fetch ArXiv articles with CC license information using OAI-PMH API.
4+
OAI-PMH: Open Archives Initiative Protocol for Metadata Harvesting.
45
"""
56
# Standard library
67
import argparse
@@ -12,6 +13,7 @@
1213
import traceback
1314
import xml.etree.ElementTree as ET # XML parsing for OAI-PMH responses
1415
from collections import Counter, defaultdict
16+
from copy import copy
1517
from datetime import datetime, timezone
1618
from operator import itemgetter
1719

@@ -32,45 +34,8 @@
3234
LOGGER, PATHS = shared.setup(__file__)
3335

3436
# Constants
35-
# API Configuration - Updated to use OAI-PMH for structured license data
3637
BASE_URL = "https://oaipmh.arxiv.org/oai"
37-
# Implementation choice: Set to 1000 CC-licensed papers for balanced collection
38-
# This is NOT an ArXiv API requirement - ArXiv only requires "responsible" use
39-
# The 3-second delays between requests ensure compliance with OAI-PMH practices
40-
DEFAULT_FETCH_LIMIT = 1000 # Default total CC-licensed papers to fetch
41-
DEFAULT_YEARS_BACK = 5 # Default years to look back from current year
42-
43-
# CSV Headers
44-
HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER", "AUTHOR_BUCKET", "COUNT"]
45-
HEADER_CATEGORY_REPORT = [
46-
"TOOL_IDENTIFIER",
47-
"CATEGORY_CODE",
48-
"CATEGORY_LABEL",
49-
"COUNT",
50-
]
51-
HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
52-
HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"]
53-
54-
# License mapping for structured data from OAI-PMH
55-
LICENSE_MAPPING = {
56-
"http://creativecommons.org/licenses/by/3.0/": "CC BY 3.0",
57-
"http://creativecommons.org/licenses/by/4.0/": "CC BY 4.0",
58-
"http://creativecommons.org/licenses/by-nc/3.0/": "CC BY-NC 3.0",
59-
"http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC 4.0",
60-
"http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC BY-NC-ND 3.0",
61-
"http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND 4.0",
62-
"http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC BY-NC-SA 3.0",
63-
"http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA 4.0",
64-
"http://creativecommons.org/licenses/by-nd/3.0/": "CC BY-ND 3.0",
65-
"http://creativecommons.org/licenses/by-nd/4.0/": "CC BY-ND 4.0",
66-
"http://creativecommons.org/licenses/by-sa/3.0/": "CC BY-SA 3.0",
67-
"http://creativecommons.org/licenses/by-sa/4.0/": "CC BY-SA 4.0",
68-
"http://creativecommons.org/licenses/publicdomain": "CC CERTIFICATION 1.0"
69-
" US",
70-
"http://creativecommons.org/publicdomain/zero/1.0/": "CC0 1.0",
71-
"http://creativecommons.org/share-your-work/public-domain/cc0/": "CC0",
72-
}
73-
38+
# Defaults should result in quick operation (not complete operation)
7439
# ArXiv Categories - manually curated from ArXiv official taxonomy
7540
# Source: https://arxiv.org/category_taxonomy
7641
CATEGORIES = {
@@ -230,8 +195,9 @@
230195
"nucl-th": "Nuclear Theory",
231196
"quant-ph": "Quantum Physics",
232197
}
233-
234-
# File Paths
198+
DEFAULT_FETCH_LIMIT = 1000
199+
DEFAULT_YEARS_BACK = 5
200+
# CSV file paths
235201
FILE_ARXIV_AUTHOR_BUCKET = shared.path_join(
236202
PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv"
237203
)
@@ -242,54 +208,61 @@
242208
FILE_ARXIV_YEAR = shared.path_join(
243209
PATHS["data_1-fetch"], "arxiv_3_count_by_year.csv"
244210
)
245-
# records metadata for each run for audit, reproducibility, and provenance
246211
FILE_PROVENANCE = shared.path_join(
247212
PATHS["data_1-fetch"], "arxiv_provenance.yaml"
248213
)
249-
250-
# Runtime variables
214+
# CSV headers
215+
HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER", "AUTHOR_BUCKET", "COUNT"]
216+
HEADER_CATEGORY_REPORT = [
217+
"TOOL_IDENTIFIER",
218+
"CATEGORY_CODE",
219+
"CATEGORY_LABEL",
220+
"COUNT",
221+
]
222+
HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
223+
HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"]
251224
QUARTER = os.path.basename(PATHS["data_quarter"])
252225

253226

254227
# parsing arguments function
255228
def parse_arguments():
256229
"""Parse command-line options, returns parsed argument namespace.
257230
258-
Note: The --limit parameter sets the total number of papers to fetch.
231+
Note: The --limit parameter sets the total number of articles to fetch.
259232
The --years-back parameter limits harvesting to recent years where
260233
CC licensing is more common.
261234
"""
262235
LOGGER.info("Parsing command-line options")
263236
parser = argparse.ArgumentParser(description=__doc__)
237+
parser.add_argument(
238+
"--enable-save",
239+
action="store_true",
240+
help="Enable saving results",
241+
)
242+
parser.add_argument(
243+
"--enable-git",
244+
action="store_true",
245+
help="Enable git actions (fetch, merge, add, commit, and push)",
246+
)
264247
parser.add_argument(
265248
"--limit",
266249
type=int,
267250
default=DEFAULT_FETCH_LIMIT,
268251
help=(
269-
f"Total limit of papers to fetch (default: {DEFAULT_FETCH_LIMIT})."
270-
" Use a value of -1 to remove limit."
252+
"Limit number of fetched articles (default:"
253+
f" {DEFAULT_FETCH_LIMIT}). Use a value of -1 to remove limit."
271254
),
272255
)
273256
parser.add_argument(
274257
"--years-back",
275258
type=int,
276259
default=DEFAULT_YEARS_BACK,
277260
help=(
278-
"Number of years back from current year to harvest (default:"
261+
"Number of years back from current year to fetch (default:"
279262
f" {DEFAULT_YEARS_BACK}). Use a value of -1 to specify"
280263
" <earliestDatestamp>."
281264
),
282265
)
283-
parser.add_argument(
284-
"--enable-save",
285-
action="store_true",
286-
help="Enable saving results",
287-
)
288-
parser.add_argument(
289-
"--enable-git",
290-
action="store_true",
291-
help="Enable git actions (fetch, merge, add, commit, and push)",
292-
)
293266

294267
args = parser.parse_args()
295268
if not args.enable_save and args.enable_git:
@@ -339,6 +312,28 @@ def initialize_all_data_files(args):
339312
initialize_data_file(FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET)
340313

341314

315+
def get_license_mapping():
    """
    Load CC Legal Tool metadata and populate the LICENSE_MAPPING global.

    Reads "cc-legal-tools.csv" from the PATHS["data"] directory and maps a
    simplified form of each row's CANONICAL_URL (URL scheme removed,
    trailing slashes stripped) to the row's IDENTIFIER. The simplification
    mirrors the one applied to license URLs found in OAI-PMH records, so
    that http/https and trailing-slash variants resolve to the same key.

    The resulting mapping is stored in the module-level LICENSE_MAPPING,
    ordered by identifier. Nothing is returned.

    Raises:
        OSError: if the CSV file cannot be opened.
        KeyError: if a row lacks the CANONICAL_URL or IDENTIFIER column.
    """
    global LICENSE_MAPPING
    LOGGER.info("Loading CC Legal Tool metadata for license mapping")
    file_path = shared.path_join(PATHS["data"], "cc-legal-tools.csv")
    license_mapping = {}
    # newline="" lets the csv module handle line endings itself, as
    # recommended by the csv module documentation
    with open(file_path, "r", encoding="utf-8", newline="") as file_obj:
        rows = csv.DictReader(file_obj, dialect="unix")
        for row in rows:
            # Strip both schemes (not only https) so the keys are built
            # the same way as the lookup values derived from record XML
            simple_url = row["CANONICAL_URL"].strip()
            simple_url = simple_url.replace("http://", "")
            simple_url = simple_url.replace("https://", "")
            simple_url = simple_url.rstrip("/")
            license_mapping[simple_url] = row["IDENTIFIER"]

    # Add legacy entry
    # NOTE(review): this identifier does not start with "CC", while
    # query_arxiv only processes records whose license starts with "CC" --
    # confirm whether legacy certification records are meant to be counted
    simple_url = "creativecommons.org/licenses/publicdomain"
    license_mapping[simple_url] = "CERTIFICATION 1.0 US"

    # Order entries by identifier for stable, readable iteration
    LICENSE_MAPPING = dict(sorted(license_mapping.items(), key=itemgetter(1)))
335+
336+
342337
def extract_license_from_xml(record_xml):
343338
"""
344339
Extract CC license information from OAI-PMH XML record.
@@ -355,9 +350,12 @@ def extract_license_from_xml(record_xml):
355350

356351
if license_element is not None and license_element.text:
357352
license_url = license_element.text.strip()
353+
simple_url = copy(license_url).replace("http://", "")
354+
simple_url = simple_url.replace("https://", "")
355+
simple_url = simple_url.rstrip("/")
358356
# Check the normalized URL (scheme and trailing slash removed) first
359-
if license_url in LICENSE_MAPPING:
360-
return LICENSE_MAPPING[license_url]
357+
if simple_url in LICENSE_MAPPING:
358+
return LICENSE_MAPPING[simple_url]
361359
# Validate CC URLs more strictly
362360
elif "creativecommons.org" in license_url.lower():
363361
return f"CC (ambiguous): {license_url}"
@@ -424,17 +422,11 @@ def bucket_author_count(author_count):
424422

425423
def query_arxiv(args, session):
426424
"""
427-
Main function to query ArXiv OAI-PMH API and collect CC license data.
428-
429-
Uses structured license metadata from OAI-PMH instead of text search.
430-
Harvests papers from recent years to focus on CC-licensed content.
425+
Query ArXiv OAI-PMH API and return information about CC licensed articles.
431426
"""
432-
433-
LOGGER.info("Beginning to fetch results from ArXiv OAI-PMH API")
434-
435427
LOGGER.info(
436-
f"Harvesting papers from {args.from_date} onwards "
437-
f"({args.years_back} years back)"
428+
f"Querying articles from {args.from_date} onwards ({args.years_back}"
429+
" years back)"
438430
)
439431

440432
# Data structures for counting
@@ -444,6 +436,7 @@ def query_arxiv(args, session):
444436
author_counts = defaultdict(lambda: defaultdict(int))
445437

446438
total_fetched = 0
439+
cc_articles_found = 0
447440
resumption_token = None
448441

449442
# Proceed is set to False when limit reached or end of records (missing
@@ -501,12 +494,13 @@ def query_arxiv(args, session):
501494
if args.limit > 0 and args.limit <= total_fetched:
502495
proceed = False
503496
break
497+
total_fetched += 1
504498

505499
# Convert record to string for metadata extraction
506500
record_xml = ET.tostring(record, encoding="unicode")
507501
metadata = extract_metadata_from_xml(record_xml)
508502

509-
# Only process CC-licensed papers
503+
# Only process CC-licensed articles
510504
if metadata["license"].startswith("CC"):
511505
license_info = metadata["license"]
512506
category = metadata["category"]
@@ -525,11 +519,12 @@ def query_arxiv(args, session):
525519
# Count by author count and license
526520
author_counts[license_info][author_count] += 1
527521

528-
total_fetched += 1
529522
batch_cc_count += 1
523+
cc_articles_found += 1
530524

531525
LOGGER.info(
532-
f" Batch completed: {batch_cc_count} CC-licensed papers found"
526+
f" Batch CC licensed articles: {batch_cc_count}, Total"
527+
f" CC-licensed articles: {cc_articles_found}"
533528
)
534529

535530
# Check for resumption token
@@ -548,16 +543,13 @@ def query_arxiv(args, session):
548543
# https://info.arxiv.org/help/api/tou.html#rate-limits
549544
time.sleep(3)
550545

551-
LOGGER.info(f"Total papers with CC licenses fetched: {total_fetched}")
552-
553546
data = {
554547
"author_counts": author_counts,
555548
"category_counts": category_counts,
556549
"license_counts": license_counts,
557550
"year_counts": year_counts,
558551
}
559-
560-
return data, total_fetched
552+
return data, cc_articles_found
561553

562554

563555
def rows_to_csv(args, fieldnames, rows, file_path):
@@ -634,7 +626,7 @@ def write_data(args, data):
634626
rows_to_csv(args, HEADER_YEAR, rows, FILE_ARXIV_YEAR)
635627

636628

637-
def write_provence(args, total_fetched):
629+
def write_provence(args, cc_articles_found):
638630
"""
639631
Write provenance information to YAML file.
640632
"""
@@ -643,7 +635,7 @@ def write_provence(args, total_fetched):
643635

644636
# Save provenance
645637
provenance_data = {
646-
"total_fetched": total_fetched,
638+
"cc_articles_found": cc_articles_found,
647639
"from_date": args.from_date,
648640
"years_back": args.years_back,
649641
"limit": args.limit,
@@ -667,15 +659,15 @@ def write_provence(args, total_fetched):
667659

668660
def main():
669661
"""Main function."""
670-
LOGGER.info("Script execution started.")
671662
args = parse_arguments()
672663
shared.paths_log(LOGGER, PATHS)
673664
shared.git_fetch_and_merge(args, PATHS["repo"])
674665
initialize_all_data_files(args)
666+
get_license_mapping()
675667
session = shared.get_session()
676-
data, total_fetched = query_arxiv(args, session)
668+
data, cc_articles_found = query_arxiv(args, session)
677669
write_data(args, data)
678-
write_provence(args, total_fetched)
670+
write_provence(args, cc_articles_found)
679671
args = shared.git_add_and_commit(
680672
args,
681673
PATHS["repo"],

0 commit comments

Comments
 (0)