Skip to content

Commit 62cb7e3

Browse files
Add Flickr fetch script and workflow integration
Implements automated Flickr data collection following existing patterns. Includes all 8 CC licenses and integrates with 1-fetch workflow. Fixes #164
1 parent 19249f8 commit 62cb7e3

File tree

2 files changed

+223
-0
lines changed

2 files changed

+223
-0
lines changed

.github/workflows/1-fetch.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,14 @@ jobs:
6767
--enable-save --enable-git
6868
env:
6969
GH_TOKEN: ${{ secrets.BOT_TOKEN }}
70+
71+
# CC Technology team members:
72+
# See cc-quantifying-bot Flickr entry in Bitwarden for information
73+
# on FLICKR_ secrets
74+
- name: Fetch from Flickr
75+
run: |
76+
./scripts/1-fetch/flickr_fetch.py \
77+
--limit=100 --enable-save --enable-git
78+
env:
79+
FLICKR_API_KEY: ${{ secrets.FLICKR_API_KEY }}
80+
FLICKR_API_SECRET: ${{ secrets.FLICKR_API_SECRET }}

scripts/1-fetch/flickr_fetch.py

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
#!/usr/bin/env python
2+
"""
3+
Fetch CC photo license data from Flickr API for quarterly analysis.
4+
"""
5+
import argparse
6+
import csv
7+
import json
8+
import os
9+
import sys
10+
import time
11+
import traceback
12+
13+
import flickrapi
14+
from dotenv import load_dotenv
15+
from pygments import highlight
16+
from pygments.formatters import TerminalFormatter
17+
from pygments.lexers import PythonTracebackLexer
18+
19+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
20+
import shared # noqa: E402
21+
22+
# Project logger and path lookup, both provided by the shared helper module
LOGGER, PATHS = shared.setup(__file__)
# Read the dotenv file before the credentials are looked up below
load_dotenv(PATHS["dotenv"])

# Credentials (None when the variable is not set in the environment)
FLICKR_API_KEY = os.environ.get("FLICKR_API_KEY")
FLICKR_API_SECRET = os.environ.get("FLICKR_API_SECRET")

# Output file location and its CSV column order
HEADER1_COUNT = ["LICENSE_ID", "LICENSE_NAME", "COUNT"]
FILE1_COUNT = os.path.join(PATHS["data_phase"], "flickr_1_count.csv")

# Last path component of the quarter directory, used in the commit message
QUARTER = os.path.basename(PATHS["data_quarter"])

# Flickr license id -> display name
# https://www.flickr.com/services/api/flickr.photos.licenses.getInfo.html
FLICKR_LICENSES = {
    1: "CC BY-NC-SA 2.0",
    2: "CC BY-NC 2.0",
    3: "CC BY-NC-ND 2.0",
    4: "CC BY 2.0",
    5: "CC BY-SA 2.0",
    6: "CC BY-ND 2.0",
    9: "Public Domain Dedication (CC0)",
    10: "Public Domain Mark",
}

# License ids to query, in the insertion order of the mapping above
CC_LICENSES = list(FLICKR_LICENSES)

LOGGER.info("Script execution started.")
46+
47+
48+
def parse_arguments():
    """
    Parse the command-line options and return the resulting namespace.

    Exits through the parser's error handler when --enable-git is given
    without --enable-save.
    """
    LOGGER.info("Parsing command-line options")
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--limit",
        type=int,
        default=100,
        help="Limit number of photos per license (default: 100)",
    )

    # Boolean switches, registered in the order they appear in --help
    switches = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
        ("--dev", "Development mode: generate fake data without API calls"),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    options = cli.parse_args()
    if options.enable_git and not options.enable_save:
        cli.error("--enable-git requires --enable-save")
    return options
76+
77+
78+
def get_flickr_api():
    """
    Build the Flickr API client from the module-level credentials.

    Raises shared.QuantifyingException when either the API key or the API
    secret is missing or empty.
    """
    LOGGER.info("Setting up Flickr API")

    credentials = (FLICKR_API_KEY, FLICKR_API_SECRET)
    if not all(credentials):
        message = "Missing Flickr API credentials. Check your .env file."
        raise shared.QuantifyingException(message)

    api_key, api_secret = credentials
    return flickrapi.FlickrAPI(api_key, api_secret, format="json")
91+
92+
93+
def fetch_license_count(flickr, license_id, limit=100):
    """
    Fetch photo count for a specific license from Flickr API.

    Args:
        flickr: client object exposing ``photos.search``. Presumably the
            flickrapi client created with format="json", which hands back
            the raw JSON payload -- confirm against get_flickr_api().
        license_id: Flickr license id to search for.
        limit: upper bound applied to the reported count (default: 100).

    Returns:
        int: ``min(total, limit)`` where ``total`` is the number of photos
        Flickr reports for the license, or 0 when the request fails, the
        API reports an error, or the response carries no total.

    This function never raises: every failure is logged and reported as 0.
    """
    license_name = FLICKR_LICENSES.get(license_id, "Unknown")
    LOGGER.info(f"Fetching count for license {license_id}: {license_name}")

    try:
        # Only the "total" field of the response is read, so a single
        # photo per page is enough; there is no need to download up to
        # 500 photo records just to discard them.
        photos_json = flickr.photos.search(
            license=license_id, per_page=1, page=1
        )

        # json.loads accepts both bytes and str payloads
        photos_data = json.loads(photos_json)

        # Flickr reports request-level failures inside the payload
        # ({"stat": "fail", "code": ..., "message": ...}); surface the
        # reason instead of a generic "no data" warning.
        if isinstance(photos_data, dict) and photos_data.get("stat") == "fail":
            code = photos_data.get("code")
            message = photos_data.get("message")
            LOGGER.error(
                f" Flickr API error for license {license_id}:"
                f" {code} {message}"
            )
            return 0

        if "photos" in photos_data and "total" in photos_data["photos"]:
            total = int(photos_data["photos"]["total"])
            count = min(total, limit)
            LOGGER.info(f" Found {count} photos (total available: {total})")
            return count

        LOGGER.warning(f" No data returned for license {license_id}")
        return 0

    except Exception as e:
        # Deliberate best-effort: one failing license must not abort the
        # remaining licenses, so the error is logged and 0 is returned.
        LOGGER.error(f" Failed to fetch count for license {license_id}: {e}")
        return 0
117+
118+
119+
def generate_fake_data(args):
    """
    Build deterministic placeholder counts for dev mode (no API calls).

    Each license in CC_LICENSES gets an equal integer share of args.limit
    plus offsets derived from its id and its position in the list.
    Returns a dict mapping license id to the fake count.
    """
    LOGGER.info("Creating fake data for dev mode")

    share = args.limit // len(CC_LICENSES)
    return {
        license_id: share + (license_id * 10) + (position * 5)
        for position, license_id in enumerate(CC_LICENSES)
    }
129+
130+
131+
def save_data(args, license_counts):
    """
    Save license count data to CSV file.

    Args:
        args: parsed command-line namespace; nothing is written unless
            ``args.enable_save`` is true.
        license_counts: dict mapping Flickr license id to photo count.

    Writes FILE1_COUNT with the HEADER1_COUNT columns, one row per license,
    ordered by license id. Returns None.
    """
    if not args.enable_save:
        LOGGER.info("Save disabled, skipping file write")
        return

    LOGGER.info(f"Writing data to {FILE1_COUNT}")

    # The target directory may not exist yet (for example on the first run
    # of a new quarter); create it so open() does not fail.
    target_dir = os.path.dirname(FILE1_COUNT)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Same "Unknown" fallback as fetch_license_count, so an unexpected id
    # cannot abort the write with a KeyError.
    data_rows = [
        {
            "LICENSE_ID": license_id,
            "LICENSE_NAME": FLICKR_LICENSES.get(license_id, "Unknown"),
            "COUNT": count,
        }
        for license_id, count in sorted(license_counts.items())
    ]

    with open(FILE1_COUNT, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=HEADER1_COUNT)
        writer.writeheader()
        writer.writerows(data_rows)

    LOGGER.info(f"Successfully wrote {len(data_rows)} records")
157+
158+
159+
def main():
    """
    Run the fetch: parse options, optionally sync git, collect one count
    per license (fake data in dev mode), save, optionally commit and push,
    then log a summary.

    Any exception is logged and the process exits with status 1.
    """
    try:
        args = parse_arguments()

        if args.enable_git:
            shared.git_fetch_and_merge(args, PATHS["repo"])

        if args.dev:
            license_counts = generate_fake_data(args)
        else:
            license_counts = {}
            client = get_flickr_api()
            for license_id in CC_LICENSES:
                license_counts[license_id] = fetch_license_count(
                    client, license_id, args.limit
                )
                # Short pause between consecutive API requests
                time.sleep(0.1)

        save_data(args, license_counts)

        if args.enable_git:
            commit_message = f"Add Flickr data for {QUARTER}"
            args = shared.git_add_and_commit(
                args,
                PATHS["repo"],
                PATHS["data_quarter"],
                commit_message,
            )
            shared.git_push_changes(args, PATHS["repo"])

        total_photos = sum(license_counts.values())
        LOGGER.info(f"Done. Total photos across all licenses: {total_photos}")

        # Per-license summary, ordered by license id
        for license_id, count in sorted(license_counts.items()):
            license_name = FLICKR_LICENSES[license_id]
            LOGGER.info(
                f" License {license_id} ({license_name}): {count} photos"
            )

    except shared.QuantifyingException as e:
        LOGGER.error(f"Error: {e}")
        sys.exit(1)
    except Exception as e:
        LOGGER.error(f"Unexpected error: {e}")
        # 10 is the numeric value of logging.DEBUG: the colorized
        # traceback is printed only when debug logging is enabled.
        if LOGGER.isEnabledFor(10):
            print(
                highlight(
                    traceback.format_exc(),
                    PythonTracebackLexer(),
                    TerminalFormatter(),
                )
            )
        sys.exit(1)
209+
210+
211+
# Run the fetch only when executed as a script, not when imported
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)