Skip to content

Commit 1daba78

Browse files
authored
Merge pull request #278 from creativecommons/process_report
Add Smithsonian process and report script
2 parents 121c971 + 071c3b8 commit 1daba78

File tree

4 files changed

+765
-9
lines changed

4 files changed

+765
-9
lines changed

scripts/1-fetch/smithsonian_fetch.py

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,77 @@
3939
"TOTAL_OBJECTS",
4040
]
4141
# Column headers for the per-unit CSV written by this fetch script
HEADER_2_UNITS = [
    "UNIT_CODE",
    "DATA_SOURCE",
    "CC0_RECORDS",
    "CC0_RECORDS_WITH_CC0_MEDIA",
    "TOTAL_OBJECTS",
]
# Quarter directory name (format YYYYQx, e.g. "2024Q3")
QUARTER = os.path.basename(PATHS["data_quarter"])
4849

50+
# Manually compiled unit code and name from URL
# 'https://github.com/Smithsonian/OpenAccess'
# NOTE: fixed the repeated "Musuem" misspelling — these strings are emitted
# as the DATA_SOURCE values in the fetched CSV.
UNIT_MAP = {
    "AAA": "Archives of American Art",
    "AAG": "Archives of American Gardens",
    "ACM": "Anacostia Community Museum",
    "ACMA": "Anacostia Community Museum Archives",
    "CFCHFOLKLIFE": "Ralph Rinzler Folklife Archives and Collections",
    "CHNDM": "Cooper Hewitt, Smithsonian Design Museum",
    "FBR": "Smithsonian Field Book Project",
    "FSG": "Freer Gallery of Art and Arthur M. Sackler Gallery",
    "HAC": "Smithsonian Gardens",
    "HMSG": "Hirshhorn Museum and Sculpture Garden",
    "HSFA": "Human Studies Film Archives",
    "NASM": "National Air and Space Museum",
    "NMAAHC": "National Museum of African American History and Culture",
    "NMAH": "National Museum of American History",
    "NMAI": "National Museum of the American Indian",
    "NMAfA": "National Museum of African Art",
    "NMNHANTHRO": "National Museum of Natural History - Anthropology Dept.",
    "NMNHBIRDS": (
        "National Museum of Natural History - Vertebrate Zoology - Birds"
        " Division"
    ),
    "NMNHBOTANY": "National Museum of Natural History - Botany Dept.",
    "NMNHEDUCATION": (
        "National Museum of Natural History - Education & Outreach"
    ),
    "NMNHENTO": "National Museum of Natural History - Entomology Dept.",
    "NMNHFISHES": (
        "National Museum of Natural History - Vertebrate Zoology - Fishes"
        " Division"
    ),
    "NMNHHERPS": (
        "National Museum of Natural History - Vertebrate Zoology - Herpetology"
        " Division"
    ),
    "NMNHINV": (
        "National Museum of Natural History - Invertebrate Zoology Dept."
    ),
    "NMNHMAMMALS": (
        "National Museum of Natural History"
        " - Vertebrate Zoology - Mammals Division"
    ),
    "NMNHMINSCI": (
        "National Museum of Natural History - Mineral Sciences Dept."
    ),
    "NMNHPALEO": "National Museum of Natural History - Paleobiology Dept.",
    "NPG": "National Portrait Gallery",
    "NPM": "National Postal Museum",
    "NZP": "Smithsonian's National Zoo & Conservation Biology Institute",
    "OCIO_DPO3D": "OCIO Digital Preservation & 3D Team",
    "OFEO-SG": "Office of Facilities Engineering &"
    " Operations – Smithsonian Gardens",
    "SAAM": "Smithsonian American Art Museum",
    "SIA": "Smithsonian Institution Archives",
    "SIL": "Smithsonian Libraries",
    "SILAF": "Smithsonian Institution Libraries, African Section",
    "SILNMAHTL": "Smithsonian Institution Libraries,"
    " National Museum of American History, Library",
    "SLA_SRO": "Smithsonian Libraries Archives, Special Research/Operations",
}
112+
49113

50114
def parse_arguments():
51115
"""
@@ -102,7 +166,7 @@ def query_smithsonian(args, session):
102166
" API key is set in .env",
103167
1,
104168
)
105-
LOGGER.info("Fetch CC0 metrics and units from units from Smithsonain")
169+
LOGGER.info("Fetch CC0 metrics and units from units from Smithsonian")
106170
url = "https://api.si.edu/openaccess/api/v1.0/stats"
107171
params = {"api_key": DATA_GOV_API_KEY}
108172
try:
@@ -132,15 +196,16 @@ def query_smithsonian(args, session):
132196
continue
133197
data_units.append(
134198
{
135-
"UNIT": unit["unit"],
199+
"UNIT_CODE": unit["unit"],
200+
"DATA_SOURCE": UNIT_MAP.get(unit["unit"], unit["unit"]),
136201
"CC0_RECORDS": unit["metrics"]["CC0_records"],
137202
"CC0_RECORDS_WITH_CC0_MEDIA": unit["metrics"][
138203
"CC0_records_with_CC0_media"
139204
],
140205
"TOTAL_OBJECTS": unit["total_objects"],
141206
}
142207
)
143-
data_units = sorted(data_units, key=itemgetter("UNIT"))
208+
data_units = sorted(data_units, key=itemgetter("UNIT_CODE"))
144209
LOGGER.info(f"Fetched stats for {len(data_units)} units")
145210
return data_metrics, data_units
146211

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
#!/usr/bin/env python
2+
"""
3+
This file is dedicated to processing Smithsonian data
4+
for analysis and comparison between quarters.
5+
"""
6+
7+
# Standard library
8+
import argparse
9+
import os
10+
import sys
11+
import traceback
12+
13+
# Third-party
14+
import pandas as pd
15+
16+
# Add parent directory so shared can be imported
17+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
18+
19+
# First-party/Local
20+
import shared # noqa: E402
21+
22+
# Setup — shared.setup wires up logging and the project path map
LOGGER, PATHS = shared.setup(__file__)

# Constants
# Quarter directory name (format YYYYQx per the --quarter help text)
QUARTER = os.path.basename(PATHS["data_quarter"])
# Processed output files this script is responsible for producing
FILE_PATHS = [
    shared.path_join(PATHS["data_phase"], "smithsonian_totals_by_units.csv"),
    shared.path_join(PATHS["data_phase"], "smithsonian_totals_by_records.csv"),
]
31+
32+
33+
def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.
    """
    # All three module-level values may be rebound when --quarter differs
    global FILE_PATHS, PATHS, QUARTER
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--quarter",
        default=QUARTER,
        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
    )
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results (default: False)",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions such as fetch, merge, add, commit, and push"
        " (default: False)",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Regenerate data even if processed files already exist",
    )

    args = parser.parse_args()
    # Git actions write saved results, so --enable-git alone is inconsistent
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")
    if args.quarter != QUARTER:
        # Retarget the module-level paths at the requested quarter
        FILE_PATHS = shared.paths_list_update(
            LOGGER, FILE_PATHS, QUARTER, args.quarter
        )
        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
        QUARTER = args.quarter
    # Attach logger/paths so downstream shared helpers can use them
    args.logger = LOGGER
    args.paths = PATHS
    return args
75+
76+
77+
def process_totals_by_units(args, count_data):
    """
    Processing count data: totals by units
    """
    # NOTE: the docstring above is logged at runtime; keep it stable
    LOGGER.info(process_totals_by_units.__doc__.strip())
    data = {}

    for row in count_data.itertuples(index=False):
        unit = str(row.DATA_SOURCE)
        total_objects = int(row.TOTAL_OBJECTS)

        # Accumulate rather than assign so repeated data-source names are
        # summed instead of silently overwritten (consistent with the
        # aggregation in process_totals_by_records)
        data[unit] = data.get(unit, 0) + total_objects

    data = pd.DataFrame(data.items(), columns=["Data_source", "Total_objects"])
    data.sort_values("Data_source", ascending=True, inplace=True)
    data.reset_index(drop=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "smithsonian_totals_by_units.csv"
    )
    shared.dataframe_to_csv(args, data, file_path)
97+
98+
99+
def process_totals_by_records(args, count_data):
    """
    Processing count data: totals by records
    """
    # NOTE: the docstring above is logged at runtime; keep it stable
    LOGGER.info(process_totals_by_records.__doc__.strip())
    totals = {}

    # Aggregate counts per data source; duplicate source names are summed
    for row in count_data.itertuples(index=False):
        source = str(row.DATA_SOURCE)
        entry = totals.setdefault(
            source,
            {
                "CC0_records": 0,
                "CC0_records_with_CC0_media": 0,
                "Total_objects": 0,
            },
        )
        entry["CC0_records"] += int(row.CC0_RECORDS)
        entry["CC0_records_with_CC0_media"] += int(
            row.CC0_RECORDS_WITH_CC0_MEDIA
        )
        entry["Total_objects"] += int(row.TOTAL_OBJECTS)

    data = (
        pd.DataFrame.from_dict(totals, orient="index")
        .reset_index()
        .rename(columns={"index": "Data_source"})
    )

    # Percentage breakdowns relative to each source's total object count
    without_media = data["CC0_records"] - data["CC0_records_with_CC0_media"]
    data["CC0_without_media_percentage"] = (
        without_media / data["Total_objects"] * 100
    ).round(2)
    data["CC0_with_media_percentage"] = (
        data["CC0_records_with_CC0_media"] / data["Total_objects"] * 100
    ).round(2)
    data["Others_percentage"] = (
        (data["Total_objects"] - data["CC0_records"])
        / data["Total_objects"]
        * 100
    ).round(2)

    data = data.sort_values("Data_source", ascending=True)
    data = data.reset_index(drop=True)

    file_path = shared.path_join(
        PATHS["data_phase"], "smithsonian_totals_by_records.csv"
    )
    shared.dataframe_to_csv(args, data, file_path)
152+
153+
154+
def main():
    """
    Load the fetched Smithsonian per-unit CSV, write the processed
    totals-by-units and totals-by-records reports, and optionally commit
    and push the results.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])
    # Skips regeneration when FILE_PATHS already exist, unless --force
    # was given (per the --force help text)
    shared.check_completion_file_exists(args, FILE_PATHS)
    file_count = shared.path_join(
        PATHS["data_1-fetch"], "smithsonian_2_units.csv"
    )
    count_data = shared.open_data_file(
        LOGGER,
        file_count,
        usecols=[
            "UNIT_CODE",
            "DATA_SOURCE",
            "CC0_RECORDS",
            "CC0_RECORDS_WITH_CC0_MEDIA",
            "TOTAL_OBJECTS",
        ],
    )
    process_totals_by_units(args, count_data)
    process_totals_by_records(args, count_data)

    # Push changes
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        # Fixed copy-pasted commit message: this script handles Smithsonian
        # data, not GitHub data
        f"Add and commit new Smithsonian data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])
184+
185+
186+
if __name__ == "__main__":
    try:
        main()
    # Project exceptions carry their own exit code; code 0 is logged as
    # informational rather than as an error
    except shared.QuantifyingException as e:
        if e.exit_code == 0:
            LOGGER.info(e.message)
        else:
            LOGGER.error(e.message)
        sys.exit(e.exit_code)
    # Log and re-raise explicit exits (e.g. argparse errors) with their code
    except SystemExit as e:
        LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    # 130 is the conventional exit status for termination via SIGINT (Ctrl-C)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    # Catch-all boundary: log the full traceback, exit non-zero
    except Exception:
        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
        sys.exit(1)

0 commit comments

Comments
 (0)