Skip to content

Commit 16c26c3

Browse files
authored
Merge branch 'main' into data
2 parents 958d291 + dceb663 commit 16c26c3

File tree

8 files changed

+185
-77
lines changed

8 files changed

+185
-77
lines changed

scripts/2-process/gcs_process.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,24 @@
2626

2727
# Constants
2828
QUARTER = os.path.basename(PATHS["data_quarter"])
29+
FILE_PATHS = [
30+
shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv"),
31+
shared.path_join(PATHS["data_phase"], "gcs_status_combined_totals.csv"),
32+
shared.path_join(PATHS["data_phase"], "gcs_status_lastest_totals.csv"),
33+
shared.path_join(PATHS["data_phase"], "gcs_status_prior_totals.csv"),
34+
shared.path_join(PATHS["data_phase"], "gcs_status_retired_totals.csv"),
35+
shared.path_join(PATHS["data_phase"], "gcs_totals_by_country.csv"),
36+
shared.path_join(PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"),
37+
shared.path_join(PATHS["data_phase"], "gcs_totals_by_language.csv"),
38+
shared.path_join(PATHS["data_phase"], "gcs_totals_by_restrictions.csv"),
39+
]
2940

3041

3142
def parse_arguments():
3243
"""
3344
Parse command-line options, returns parsed argument namespace.
3445
"""
46+
global QUARTER
3547
LOGGER.info("Parsing command-line options")
3648
parser = argparse.ArgumentParser(description=__doc__)
3749
parser.add_argument(
@@ -47,15 +59,23 @@ def parse_arguments():
4759
parser.add_argument(
4860
"--enable-git",
4961
action="store_true",
50-
help="Enable git actions such as fetch, merge, add, commit, and push"
51-
" (default: False)",
62+
help="Enable git actions such as fetch, merge, add, commit, and push",
63+
)
64+
parser.add_argument(
65+
"--force",
66+
action="store_true",
67+
help="Regenerate data even if processed files already exist",
5268
)
5369
args = parser.parse_args()
5470
if not args.enable_save and args.enable_git:
5571
parser.error("--enable-git requires --enable-save")
5672
if args.quarter != QUARTER:
57-
global PATHS
73+
global FILE_PATHS, PATHS
74+
FILE_PATHS = shared.paths_list_update(
75+
LOGGER, FILE_PATHS, QUARTER, args.quarter
76+
)
5877
PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
78+
QUARTER = args.quarter
5979
args.logger = LOGGER
6080
args.paths = PATHS
6181
return args
@@ -297,6 +317,7 @@ def main():
297317
args = parse_arguments()
298318
shared.paths_log(LOGGER, PATHS)
299319
shared.git_fetch_and_merge(args, PATHS["repo"])
320+
shared.check_for_data_files(args, FILE_PATHS, QUARTER)
300321

301322
# Count data
302323
file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")

scripts/2-process/github_process.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,17 @@
2323

2424
# Constants
2525
QUARTER = os.path.basename(PATHS["data_quarter"])
26+
FILE_PATHS = [
27+
shared.path_join(PATHS["data_phase"], "github_totals_by_license.csv"),
28+
shared.path_join(PATHS["data_phase"], "github_totals_by_restriction.csv"),
29+
]
2630

2731

2832
def parse_arguments():
2933
"""
3034
Parse command-line options, returns parsed argument namespace.
3135
"""
36+
global QUARTER
3237
LOGGER.info("Parsing command-line options")
3338
parser = argparse.ArgumentParser(description=__doc__)
3439
parser.add_argument(
@@ -47,24 +52,27 @@ def parse_arguments():
4752
help="Enable git actions such as fetch, merge, add, commit, and push"
4853
" (default: False)",
4954
)
55+
parser.add_argument(
56+
"--force",
57+
action="store_true",
58+
help="Regenerate data even if processed files already exist",
59+
)
60+
5061
args = parser.parse_args()
5162
if not args.enable_save and args.enable_git:
5263
parser.error("--enable-git requires --enable-save")
5364
if args.quarter != QUARTER:
54-
global PATHS
65+
global FILE_PATHS, PATHS
66+
FILE_PATHS = shared.paths_list_update(
67+
LOGGER, FILE_PATHS, QUARTER, args.quarter
68+
)
5569
PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
70+
QUARTER = args.quarter
5671
args.logger = LOGGER
5772
args.paths = PATHS
5873
return args
5974

6075

61-
def check_for_data_file(file_path):
62-
if os.path.exists(file_path):
63-
raise shared.QuantifyingException(
64-
f"Processed data already exists for {QUARTER}", 0
65-
)
66-
67-
6876
def process_totals_by_license(args, count_data):
6977
"""
7078
Processing count data: totals by License
@@ -86,9 +94,7 @@ def process_totals_by_license(args, count_data):
8694
data.reset_index(drop=True, inplace=True)
8795
file_path = shared.path_join(
8896
PATHS["data_phase"], "github_totals_by_license.csv"
89-
)
90-
check_for_data_file(file_path)
91-
shared.data_to_csv(args, data, file_path, PATHS)
97+
shared.data_to_csv(args, data, file_paths, PATH)
9298

9399

94100
def process_totals_by_restriction(args, count_data):
@@ -122,15 +128,14 @@ def process_totals_by_restriction(args, count_data):
122128
file_path = shared.path_join(
123129
PATHS["data_phase"], "github_totals_by_restriction.csv"
124130
)
125-
check_for_data_file(file_path)
126131
shared.data_to_csv(args, data, file_path, PATHS)
127132

128133

129134
def main():
130135
args = parse_arguments()
131136
shared.paths_log(LOGGER, PATHS)
132137
shared.git_fetch_and_merge(args, PATHS["repo"])
133-
138+
shared.check_for_data_files(args, FILE_PATHS, QUARTER)
134139
file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
135140
count_data = shared.open_data_file(
136141
LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
@@ -156,7 +161,7 @@ def main():
156161
LOGGER.info(e.message)
157162
else:
158163
LOGGER.error(e.message)
159-
sys.exit(e.code)
164+
sys.exit(e.exit_code)
160165
except SystemExit as e:
161166
LOGGER.error(f"System exit with code: {e.code}")
162167
sys.exit(e.code)

scripts/2-process/wikipedia_process.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,24 @@
2727

2828
# Constants
2929
QUARTER = os.path.basename(PATHS["data_quarter"])
30+
FILE_PATHS = [
31+
shared.path_join(
32+
PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
33+
),
34+
shared.path_join(
35+
PATHS["data_phase"], "wikipedia_least_language_usage.csv"
36+
),
37+
shared.path_join(
38+
PATHS["data_phase"], "wikipedia_language_representation.csv"
39+
),
40+
]
3041

3142

3243
def parse_arguments():
3344
"""
3445
Parse command-line options, returns parsed argument namespace.
3546
"""
47+
global QUARTER
3648
LOGGER.info("Parsing command-line options")
3749
parser = argparse.ArgumentParser(description=__doc__)
3850
parser.add_argument(
@@ -51,24 +63,27 @@ def parse_arguments():
5163
help="Enable git actions such as fetch, merge, add, commit, and push"
5264
" (default: False)",
5365
)
66+
parser.add_argument(
67+
"--force",
68+
action="store_true",
69+
help="Regenerate data even if processed files already exist",
70+
)
71+
5472
args = parser.parse_args()
5573
if not args.enable_save and args.enable_git:
5674
parser.error("--enable-git requires --enable-save")
5775
if args.quarter != QUARTER:
58-
global PATHS
76+
global FILE_PATHS, PATHS
77+
FILE_PATHS = shared.paths_list_update(
78+
LOGGER, FILE_PATHS, QUARTER, args.quarter
79+
)
5980
PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
81+
QUARTER = args.quarter
6082
args.logger = LOGGER
6183
args.paths = PATHS
6284
return args
6385

6486

65-
def check_for_data_file(file_path):
66-
if os.path.exists(file_path):
67-
raise shared.QuantifyingException(
68-
f"Processed data already exists for {QUARTER}", 0
69-
)
70-
71-
7287
def process_highest_language_usage(args, count_data):
7388
"""
7489
Processing count data: Most represented languages
@@ -87,7 +102,6 @@ def process_highest_language_usage(args, count_data):
87102
file_path = shared.path_join(
88103
PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
89104
)
90-
check_for_data_file(file_path)
91105
shared.data_to_csv(args, top_10, file_path, PATHS)
92106

93107

@@ -111,10 +125,8 @@ def process_least_language_usage(args, count_data):
111125
file_path = shared.path_join(
112126
PATHS["data_phase"], "wikipedia_least_language_usage.csv"
113127
)
114-
check_for_data_file(file_path)
115128
shared.data_to_csv(args, bottom_10, file_path, PATHS)
116129

117-
118130
def process_language_representation(args, count_data):
119131
"""
120132
Processing count data: Language representation
@@ -138,14 +150,14 @@ def process_language_representation(args, count_data):
138150
file_path = shared.path_join(
139151
PATHS["data_phase"], "wikipedia_language_representation.csv"
140152
)
141-
check_for_data_file(file_path)
142153
shared.data_to_csv(args, language_counts, file_path, PATHS)
143154

144155

145156
def main():
146157
args = parse_arguments()
147158
shared.paths_log(LOGGER, PATHS)
148159
shared.git_fetch_and_merge(args, PATHS["repo"])
160+
shared.check_for_data_files(args, FILE_PATHS, QUARTER)
149161
file_count = shared.path_join(
150162
PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
151163
)

scripts/3-report/gcs_report.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import sys
1010
import textwrap
1111
import traceback
12+
from pathlib import Path
1213

1314
# Third-party
1415
from pygments import highlight
@@ -27,7 +28,8 @@
2728

2829
# Constants
2930
QUARTER = os.path.basename(PATHS["data_quarter"])
30-
SECTION = "Google Custom Search (GCS)"
31+
SECTION_FILE = Path(__file__).name
32+
SECTION_TITLE = "Google Custom Search (GCS)"
3133

3234

3335
def parse_arguments():
@@ -83,7 +85,8 @@ def gcs_intro(args):
8385
total_count = f"{data['Count'].sum():,d}"
8486
shared.update_readme(
8587
args,
86-
SECTION,
88+
SECTION_FILE,
89+
SECTION_TITLE,
8790
"Overview",
8891
None,
8992
None,
@@ -137,7 +140,8 @@ def plot_products(args):
137140

138141
shared.update_readme(
139142
args,
140-
SECTION,
143+
SECTION_FILE,
144+
SECTION_TITLE,
141145
title,
142146
image_path,
143147
"Plots showing Creative Commons (CC) legal tool product totals and"
@@ -180,7 +184,8 @@ def plot_tool_status(args):
180184

181185
shared.update_readme(
182186
args,
183-
SECTION,
187+
SECTION_FILE,
188+
SECTION_TITLE,
184189
title,
185190
image_path,
186191
"Plots showing Creative Commons (CC) legal tool status totals and"
@@ -223,7 +228,8 @@ def plot_latest_tools(args):
223228

224229
shared.update_readme(
225230
args,
226-
SECTION,
231+
SECTION_FILE,
232+
SECTION_TITLE,
227233
title,
228234
image_path,
229235
"Plots showing latest Creative Commons (CC) legal tool totals and"
@@ -265,7 +271,8 @@ def plot_prior_tools(args):
265271

266272
shared.update_readme(
267273
args,
268-
SECTION,
274+
SECTION_FILE,
275+
SECTION_TITLE,
269276
title,
270277
image_path,
271278
"Plots showing prior Creative Commons (CC) legal tool totals and"
@@ -311,7 +318,8 @@ def plot_retired_tools(args):
311318

312319
shared.update_readme(
313320
args,
314-
SECTION,
321+
SECTION_FILE,
322+
SECTION_TITLE,
315323
title,
316324
image_path,
317325
"Plots showing retired Creative Commons (CC) legal tools total and"
@@ -360,7 +368,8 @@ def plot_countries_highest_usage(args):
360368

361369
shared.update_readme(
362370
args,
363-
SECTION,
371+
SECTION_FILE,
372+
SECTION_TITLE,
364373
title,
365374
image_path,
366375
"Plots showing countries with the highest useage of the latest"
@@ -413,7 +422,8 @@ def plot_languages_highest_usage(args):
413422

414423
shared.update_readme(
415424
args,
416-
SECTION,
425+
SECTION_FILE,
426+
SECTION_TITLE,
417427
title,
418428
image_path,
419429
"Plots showing languages with the highest useage of the latest"
@@ -460,7 +470,8 @@ def plot_free_culture(args):
460470

461471
shared.update_readme(
462472
args,
463-
SECTION,
473+
SECTION_FILE,
474+
SECTION_TITLE,
464475
title,
465476
image_path,
466477
"Plots showing Approved for Free Cultural Works legal tool usage.",

0 commit comments

Comments
 (0)