Skip to content

Commit 47b5f20

Browse files
committed
Merge branch 'main' into europeana-feature
2 parents ed87d34 + 253c577 commit 47b5f20

8 files changed

Lines changed: 320 additions & 122 deletions

File tree

dev/create_gcs_query_plan.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def get_tool_urls():
117117
file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
118118
prefix = "//creativecommons.org/"
119119
tool_urls = []
120-
with open(file_path, "r") as file_obj:
120+
with open(file_path, "r", encoding="utf-8") as file_obj:
121121
for line in file_obj:
122122
tool_urls.append(f"{prefix}{line.strip()}")
123123
LOGGER.info("Prioritizing CC Legal Tool URLs")
@@ -127,14 +127,14 @@ def get_tool_urls():
127127

128128
def load_countries():
129129
file_path = shared.path_join(PATHS["data"], "gcs_country_collection.yaml")
130-
with open(file_path, "r") as file_obj:
130+
with open(file_path, "r", encoding="utf-8") as file_obj:
131131
countries = yaml.safe_load(file_obj)
132132
return countries
133133

134134

135135
def load_languages():
136136
file_path = shared.path_join(PATHS["data"], "gcs_language_collection.yaml")
137-
with open(file_path, "r") as file_obj:
137+
with open(file_path, "r", encoding="utf-8") as file_obj:
138138
languages = yaml.safe_load(file_obj)
139139
return languages
140140

@@ -209,7 +209,7 @@ def save_plan(plan):
209209
"LANGUAGE",
210210
"LR",
211211
]
212-
with open(file_path, "w") as file_obj:
212+
with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
213213
writer = csv.DictWriter(
214214
file_obj, fieldnames=fieldnames, dialect="unix"
215215
)

dev/prioritize_tools.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def get_tool_urls():
4242
file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
4343
prefix = "//creativecommons.org/"
4444
tool_urls = []
45-
with open(file_path, "r") as file_obj:
45+
with open(file_path, "r", encoding="utf-8") as file_obj:
4646
for line in file_obj:
4747
tool_urls.append(f"{prefix}{line.strip()}")
4848
return tool_urls
@@ -112,7 +112,7 @@ def save_tools_list(tool_urls):
112112
LOGGER.info("Saving prioritized CC Legal Tool URLs")
113113
file_path = shared.path_join(PATHS["data"], "prioritized-tool-urls.txt")
114114
tool_urls.append("") # ensure file has end of file newline
115-
with open(file_path, "w") as file_obj:
115+
with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
116116
file_obj.writelines("\n".join(tool_urls))
117117

118118

scripts/1-fetch/gcs_fetch.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def get_search_service():
104104

105105
def initialize_data_file(file_path, header):
106106
if not os.path.isfile(file_path):
107-
with open(file_path, "w", newline="") as file_obj:
107+
with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
108108
writer = csv.DictWriter(
109109
file_obj, fieldnames=header, dialect="unix"
110110
)
@@ -127,7 +127,7 @@ def get_last_completed_plan_index():
127127
last_completed_plan_index = 0
128128
for file_path in [FILE1_COUNT, FILE2_LANGUAGE, FILE3_COUNTRY]:
129129
try:
130-
with open(file_path, "r", newline="") as file_obj:
130+
with open(file_path, "r", encoding="utf-8") as file_obj:
131131
reader = csv.DictReader(file_obj, dialect="unix")
132132
for row in reader:
133133
pass # skip through to last row
@@ -147,7 +147,7 @@ def get_last_completed_plan_index():
147147
def load_plan():
148148
plan = []
149149
file_path = shared.path_join(PATHS["data"], "gcs_query_plan.csv")
150-
with open(file_path, "r", newline="") as file_obj:
150+
with open(file_path, "r", encoding="utf-8") as file_obj:
151151
plan = list(csv.DictReader(file_obj, dialect="unix"))
152152
return plan
153153

@@ -181,7 +181,7 @@ def append_data(args, plan_row, index, count):
181181
"TOOL_IDENTIFIER": plan_row["TOOL_IDENTIFIER"],
182182
"COUNT": count,
183183
}
184-
with open(file_path, "a", newline="") as file_obj:
184+
with open(file_path, "a", encoding="utf-8", newline="\n") as file_obj:
185185
writer = csv.DictWriter(
186186
file_obj, fieldnames=fieldnames, dialect="unix"
187187
)

scripts/1-fetch/github_fetch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def write_data(args, tool_data):
110110
LOGGER.error("Unable to fetch all records. Aborting.")
111111
return args
112112

113-
with open(FILE1_COUNT, "w", newline="") as file_obj:
113+
with open(FILE1_COUNT, "w", encoding="utf-8", newline="\n") as file_obj:
114114
writer = csv.DictWriter(
115115
file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
116116
)

scripts/1-fetch/wikipedia_fetch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def write_data(args, tool_data):
8383
LOGGER.info("Saving fetched data")
8484
os.makedirs(PATHS["data_phase"], exist_ok=True)
8585

86-
with open(FILE_LANGUAGES, "w", newline="", encoding="utf-8") as file_obj:
86+
with open(FILE_LANGUAGES, "w", encoding="utf-8", newline="\n") as file_obj:
8787
writer = csv.DictWriter(
8888
file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
8989
)

scripts/2-process/github_process.py

Lines changed: 128 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
#!/usr/bin/env python
22
"""
3-
This file is dedicated to processing Github data
3+
This file is dedicated to processing GitHub data
44
for analysis and comparison between quarters.
55
"""
66
# Standard library
7+
import argparse
8+
import csv
79
import os
810
import sys
911
import traceback
1012

13+
# Third-party
1114
# import pandas as pd
15+
import pandas as pd
1216

1317
# Add parent directory so shared can be imported
1418
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -19,6 +23,112 @@
1923
# Setup
2024
LOGGER, PATHS = shared.setup(__file__)
2125

26+
# Constants
27+
QUARTER = os.path.basename(PATHS["data_quarter"])
28+
29+
30+
def parse_arguments():
    """
    Build the argument parser, parse the command line, and return the
    resulting namespace with the logger and paths attached.

    Exits through parser.error() when --enable-git is given without
    --enable-save. When the requested quarter differs from the default,
    the module-level PATHS is rebuilt via shared.paths_update().
    """
    global PATHS

    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--quarter",
        default=QUARTER,
        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
    )
    # Both remaining options are plain on/off switches
    switches = (
        ("--enable-save", "Enable saving results (default: False)"),
        (
            "--enable-git",
            "Enable git actions such as fetch, merge, add, commit, and push"
            " (default: False)",
        ),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    args = parser.parse_args()

    # Git actions only make sense when results are actually written
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")

    # Repoint the shared paths when a non-default quarter was requested
    if args.quarter != QUARTER:
        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)

    args.logger = LOGGER
    args.paths = PATHS
    return args
61+
62+
63+
def data_to_csv(args, data, file_path):
    """
    Write data to file_path as CSV, but only when args.enable_save is set.

    The data argument must provide a to_csv() method (the callers in this
    file pass pandas DataFrames). The data_phase directory is created
    first if it does not exist yet.
    """
    if args.enable_save:
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        # Mirror csv.unix_dialect: quote every field, end lines with LF
        unix_like = {"quoting": csv.QUOTE_ALL, "lineterminator": "\n"}
        data.to_csv(file_path, index=False, **unix_like)
71+
72+
73+
def process_totals_by_license(args, count_data):
    """
    Processing count data: totals by License
    """
    # The docstring above doubles as the log message, so keep it short
    LOGGER.info(process_totals_by_license.__doc__.strip())

    # Convert every row first, then drop the grand-total row; when a tool
    # identifier occurs more than once, the last count wins.
    converted = (
        (str(row.TOOL_IDENTIFIER), int(row.COUNT))
        for row in count_data.itertuples(index=False)
    )
    totals = {
        tool: count
        for tool, count in converted
        if tool != "Total public repositories"
    }

    frame = pd.DataFrame(totals.items(), columns=["License", "Count"])
    frame = frame.sort_values("License", ascending=True)
    frame = frame.reset_index(drop=True)

    target = shared.path_join(
        PATHS["data_phase"], "github_totals_by_license.csv"
    )
    data_to_csv(args, frame, target)
96+
97+
98+
def process_totals_by_restriction(args, count_data):
    """
    Processing count data: totals by restriction
    """
    # https://creativecommons.org/public-domain/freeworks/
    # The docstring above doubles as the log message, so keep it short
    LOGGER.info(process_totals_by_restriction.__doc__.strip())

    # Tools missing from this table (including the grand-total row) are
    # not counted toward any category.
    category_by_tool = {
        "BSD Zero Clause License": "Public domain",
        "CC0 1.0": "Public domain",
        "Unlicense": "Public domain",
        "MIT No Attribution": "Permissive",
        "CC BY 4.0": "Permissive",
        "CC BY-SA 4.0": "Copyleft",
    }
    totals = {"Copyleft": 0, "Permissive": 0, "Public domain": 0}

    for row in count_data.itertuples(index=False):
        tool = str(row.TOOL_IDENTIFIER)
        count = int(row.COUNT)
        category = category_by_tool.get(tool)
        if category is not None:
            totals[category] += count

    frame = pd.DataFrame(totals.items(), columns=["Category", "Count"])
    frame = frame.sort_values("Category", ascending=True)
    frame = frame.reset_index(drop=True)

    target = shared.path_join(
        PATHS["data_phase"], "github_totals_by_restriction.csv"
    )
    data_to_csv(args, frame, target)
130+
131+
22132
# def load_quarter_data(quarter):
23133
# """
24134
# Load data for a specific quarter.
@@ -63,18 +173,23 @@
63173

64174

65175
def main():
    """
    Process the fetched GitHub count data for the selected quarter.

    Reads github_1_count.csv from the 1-fetch data directory, derives the
    totals by license and by restriction, then hands the quarter's data
    directory to the shared git helpers for add/commit/push.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    # NOTE(review): the shared git helpers receive args, presumably so they
    # can honor --enable-git -- confirm against shared.py
    shared.git_fetch_and_merge(args, PATHS["repo"])

    file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
    count_data = pd.read_csv(file_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
    process_totals_by_license(args, count_data)
    process_totals_by_restriction(args, count_data)

    # Push changes
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        # Name the quarter that was actually processed: --quarter may
        # override the module-level default QUARTER, and PATHS already
        # points at the requested quarter by this point.
        f"Add and commit new GitHub data for {args.quarter}",
    )
    shared.git_push_changes(args, PATHS["repo"])
78193

79194

80195
if __name__ == "__main__":

0 commit comments

Comments
 (0)