# Skip to content
#
# Fetch Data
#
# Fetch Data #470
#
# Workflow file for this run

name: Fetch Data

# Triggers: two cron schedules plus manual dispatch.
on:
  schedule:
    # Normal schedule (GitHub Actions evaluates cron expressions in UTC)
    # # at 03:15 on all days in first month of each quarter
    - cron: '15 3 * 1,4,7,10 *'
    # # at 03:15 on days 1-14 in second month of each quarter
    - cron: '15 3 1-14 2,5,8,11 *'
  workflow_dispatch:
jobs:
  # Single job: configures git, checks out the repository, installs the
  # Python dependencies, then runs each enabled fetch script in order.
  fetch:
    runs-on: ubuntu-latest

    steps:

      # CC Technology team members:
      #   See cc-quantifying-bot GitHub entry in Bitwarden for information on
      #   BOT_ secrets
      - name: Configure git
        # The secrets are passed to the shell through env rather than being
        # interpolated into the script text, so quotes or `$` characters in
        # a value cannot alter the commands that run.
        run: |
          git config --global init.defaultBranch main
          git config --global user.name "${BOT_NAME}"
          git config --global user.email "${BOT_EMAIL}"
        env:
          BOT_NAME: ${{ secrets.BOT_NAME }}
          BOT_EMAIL: ${{ secrets.BOT_EMAIL }}

      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          # Default fetch-depth is 1, however that value results in errors
          # when GitPython attempts to push changes:
          #   "failed to push some refs"
          fetch-depth: 0
          # Use the bot's token instead of the action's default token
          token: ${{ secrets.BOT_TOKEN }}

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          # Quoted so the version is read as a string, not a float
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip pipenv

      - name: Sync Python modules
        run: |
          pipenv sync --system

      # Fetch from arXiv disabled due to long run time (~6 hours)
      #
      # For now, data is fetched manually :/

      # Fetch from Europeana disabled due to API limitations
      # https://github.com/creativecommons/quantifying/issues/224

      # Fetch from GCS disabled due to Google blocking GitHub Action runners
      # # CC Technology team members:
      # #   See cc-quantifying-bot Google Workspace entry in Bitwarden for
      # #   information on GCS_ secrets
      # - name: Fetch from Google Custom Search (GCS)
      #   run: |
      #     ./scripts/1-fetch/gcs_fetch.py \
      #       --limit=100 --enable-save --enable-git
      #   env:
      #     GCS_DEVELOPER_KEY: ${{ secrets.GCS_DEVELOPER_KEY }}
      #     GCS_CX: ${{ secrets.GCS_CX }}
      #
      # For now, data is fetched manually :/

      - name: Fetch from GitHub
        run: |
          ./scripts/1-fetch/github_fetch.py \
            --enable-save --enable-git
        env:
          GH_TOKEN: ${{ secrets.BOT_TOKEN }}

      # Fetch from Openverse disabled due to API limitations
      # https://github.com/creativecommons/quantifying/issues/184

      - name: Fetch from Smithsonian
        run: |
          ./scripts/1-fetch/smithsonian_fetch.py \
            --enable-save --enable-git
        env:
          DATA_GOV_API_KEY: ${{ secrets.DATA_GOV_API_KEY }}

      - name: Fetch from Wikipedia
        run: |
          ./scripts/1-fetch/wikipedia_fetch.py \
            --enable-save --enable-git