Skip to content

Commit 21551ba

Browse files
committed
Merge branch 'main' into process-report-2025Q4
2 parents d90d811 + ff11b0a commit 21551ba

File tree

8 files changed

+74
-56
lines changed

8 files changed

+74
-56
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,13 @@ When run this way, the shared library (`scripts/shared.py`) provides easy access
227227
to all of the necessary paths and all of the modules managed by pipenv are
228228
available.
229229
230+
In order for scripts to be run directly (as shown above), the script must be
231+
executable. For more information on making files executable, please see:
232+
[File Permissions - Foundational technologies — Creative Commons Open
233+
Source][file-perms].
234+
235+
[file-perms]: https://opensource.creativecommons.org/contributing-code/foundational-tech/#file-permissions
236+
230237
231238
### Static analysis
232239

scripts/2-process/gcs_process.py

Lines changed: 7 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
"""
55
# Standard library
66
import argparse
7-
import csv
87
import os
98
import sys
109
import textwrap
@@ -82,16 +81,6 @@ def parse_arguments():
8281
return args
8382

8483

85-
def data_to_csv(args, data, file_path):
86-
if not args.enable_save:
87-
return
88-
os.makedirs(PATHS["data_phase"], exist_ok=True)
89-
# emulate csv.unix_dialect
90-
data.to_csv(
91-
file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
92-
)
93-
94-
9584
def process_product_totals(args, count_data):
9685
"""
9786
Processing count data: totals by product
@@ -131,7 +120,7 @@ def process_product_totals(args, count_data):
131120
data.items(), columns=["CC legal tool product", "Count"]
132121
)
133122
file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv")
134-
data_to_csv(args, data, file_path)
123+
shared.data_to_csv(args, data, file_path)
135124

136125

137126
def process_latest_prior_retired_totals(args, count_data):
@@ -212,7 +201,7 @@ def process_latest_prior_retired_totals(args, count_data):
212201
file_path = shared.path_join(
213202
PATHS["data_phase"], f"gcs_status_{key}_totals.csv"
214203
)
215-
data_to_csv(args, dataframe, file_path)
204+
shared.data_to_csv(args, dataframe, file_path)
216205

217206

218207
def process_totals_by_free_cultural(args, count_data):
@@ -245,7 +234,7 @@ def process_totals_by_free_cultural(args, count_data):
245234
file_path = shared.path_join(
246235
PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
247236
)
248-
data_to_csv(args, data, file_path)
237+
shared.data_to_csv(args, data, file_path)
249238

250239

251240
def process_totals_by_restrictions(args, count_data):
@@ -279,7 +268,7 @@ def process_totals_by_restrictions(args, count_data):
279268
file_path = shared.path_join(
280269
PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
281270
)
282-
data_to_csv(args, data, file_path)
271+
shared.data_to_csv(args, data, file_path)
283272

284273

285274
def process_totals_by_language(args, data):
@@ -300,7 +289,7 @@ def process_totals_by_language(args, data):
300289
file_path = shared.path_join(
301290
PATHS["data_phase"], "gcs_totals_by_language.csv"
302291
)
303-
data_to_csv(args, data, file_path)
292+
shared.data_to_csv(args, data, file_path)
304293

305294

306295
def process_totals_by_country(args, data):
@@ -321,14 +310,14 @@ def process_totals_by_country(args, data):
321310
file_path = shared.path_join(
322311
PATHS["data_phase"], "gcs_totals_by_country.csv"
323312
)
324-
data_to_csv(args, data, file_path)
313+
shared.data_to_csv(args, data, file_path)
325314

326315

327316
def main():
328317
args = parse_arguments()
329318
shared.paths_log(LOGGER, PATHS)
330319
shared.git_fetch_and_merge(args, PATHS["repo"])
331-
shared.check_for_data_files(args, FILE_PATHS, QUARTER)
320+
shared.check_completion_file_exists(args, FILE_PATHS)
332321

333322
# Count data
334323
file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")

scripts/2-process/github_process.py

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
"""
66
# Standard library
77
import argparse
8-
import csv
98
import os
109
import sys
1110
import traceback
@@ -74,16 +73,6 @@ def parse_arguments():
7473
return args
7574

7675

77-
def data_to_csv(args, data, file_path):
78-
if not args.enable_save:
79-
return
80-
os.makedirs(PATHS["data_phase"], exist_ok=True)
81-
# emulate csv.unix_dialect
82-
data.to_csv(
83-
file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
84-
)
85-
86-
8776
def process_totals_by_license(args, count_data):
8877
"""
8978
Processing count data: totals by License
@@ -106,7 +95,7 @@ def process_totals_by_license(args, count_data):
10695
file_path = shared.path_join(
10796
PATHS["data_phase"], "github_totals_by_license.csv"
10897
)
109-
data_to_csv(args, data, file_path)
98+
shared.data_to_csv(args, data, file_path)
11099

111100

112101
def process_totals_by_restriction(args, count_data):
@@ -140,14 +129,14 @@ def process_totals_by_restriction(args, count_data):
140129
file_path = shared.path_join(
141130
PATHS["data_phase"], "github_totals_by_restriction.csv"
142131
)
143-
data_to_csv(args, data, file_path)
132+
shared.data_to_csv(args, data, file_path)
144133

145134

146135
def main():
147136
args = parse_arguments()
148137
shared.paths_log(LOGGER, PATHS)
149138
shared.git_fetch_and_merge(args, PATHS["repo"])
150-
shared.check_for_data_files(args, FILE_PATHS, QUARTER)
139+
shared.check_completion_file_exists(args, FILE_PATHS)
151140
file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
152141
count_data = shared.open_data_file(
153142
LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]

scripts/2-process/wikipedia_process.py

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
"""
66
# Standard library
77
import argparse
8-
import csv
98
import os
109
import sys
1110
import textwrap
@@ -85,16 +84,6 @@ def parse_arguments():
8584
return args
8685

8786

88-
def data_to_csv(args, data, file_path):
89-
if not args.enable_save:
90-
return
91-
os.makedirs(PATHS["data_phase"], exist_ok=True)
92-
# emulate csv.unix_dialect
93-
data.to_csv(
94-
file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
95-
)
96-
97-
9887
def process_highest_language_usage(args, count_data):
9988
"""
10089
Processing count data: Most represented languages
@@ -113,7 +102,7 @@ def process_highest_language_usage(args, count_data):
113102
file_path = shared.path_join(
114103
PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
115104
)
116-
data_to_csv(args, top_10, file_path)
105+
shared.data_to_csv(args, top_10, file_path)
117106

118107

119108
def process_least_language_usage(args, count_data):
@@ -136,7 +125,7 @@ def process_least_language_usage(args, count_data):
136125
file_path = shared.path_join(
137126
PATHS["data_phase"], "wikipedia_least_language_usage.csv"
138127
)
139-
data_to_csv(args, bottom_10, file_path)
128+
shared.data_to_csv(args, bottom_10, file_path)
140129

141130

142131
def process_language_representation(args, count_data):
@@ -162,14 +151,14 @@ def process_language_representation(args, count_data):
162151
file_path = shared.path_join(
163152
PATHS["data_phase"], "wikipedia_language_representation.csv"
164153
)
165-
data_to_csv(args, language_counts, file_path)
154+
shared.data_to_csv(args, language_counts, file_path)
166155

167156

168157
def main():
169158
args = parse_arguments()
170159
shared.paths_log(LOGGER, PATHS)
171160
shared.git_fetch_and_merge(args, PATHS["repo"])
172-
shared.check_for_data_files(args, FILE_PATHS, QUARTER)
161+
shared.check_completion_file_exists(args, FILE_PATHS)
173162
file_count = shared.path_join(
174163
PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
175164
)

scripts/3-report/gcs_report.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def parse_arguments():
3636
"""
3737
Parses command-line arguments, returns parsed arguments.
3838
"""
39+
global QUARTER
3940
LOGGER.info("Parsing command-line arguments")
4041
parser = argparse.ArgumentParser(description=__doc__)
4142
parser.add_argument(
@@ -59,12 +60,18 @@ def parse_arguments():
5960
help="Enable git actions such as fetch, merge, add, commit, and push"
6061
" (default: False)",
6162
)
63+
parser.add_argument(
64+
"--force",
65+
action="store_true",
66+
help="Regenerate data even if report files exist",
67+
)
6268
args = parser.parse_args()
6369
if not args.enable_save and args.enable_git:
6470
parser.error("--enable-git requires --enable-save")
6571
if args.quarter != QUARTER:
6672
global PATHS
6773
PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
74+
QUARTER = args.quarter
6875
args.logger = LOGGER
6976
args.paths = PATHS
7077
return args
@@ -491,7 +498,8 @@ def main():
491498
args = parse_arguments()
492499
shared.paths_log(LOGGER, PATHS)
493500
shared.git_fetch_and_merge(args, PATHS["repo"])
494-
501+
last_entry = shared.path_join(PATHS["data_phase"], "gcs_free_culture.png")
502+
shared.check_completion_file_exists(args, last_entry)
495503
gcs_intro(args)
496504
plot_products(args)
497505
plot_tool_status(args)

scripts/3-report/github_report.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def parse_arguments():
3434
"""
3535
Parses command-line arguments, returns parsed arguments.
3636
"""
37+
global QUARTER
3738
LOGGER.info("Parsing command-line arguments")
3839
parser = argparse.ArgumentParser(description=__doc__)
3940
parser.add_argument(
@@ -60,14 +61,15 @@ def parse_arguments():
6061
parser.add_argument(
6162
"--force",
6263
action="store_true",
63-
help="Regenerate data even if images files already exist",
64+
help="Regenerate data even if report files exist",
6465
)
6566
args = parser.parse_args()
6667
if not args.enable_save and args.enable_git:
6768
parser.error("--enable-git requires --enable-save")
6869
if args.quarter != QUARTER:
6970
global PATHS
7071
PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
72+
QUARTER = args.quarter
7173
args.logger = LOGGER
7274
args.paths = PATHS
7375
return args
@@ -243,6 +245,10 @@ def main():
243245
args = parse_arguments()
244246
shared.paths_log(LOGGER, PATHS)
245247
shared.git_fetch_and_merge(args, PATHS["repo"])
248+
last_entry = shared.path_join(
249+
PATHS["data_phase"], "github_restriction.png"
250+
)
251+
shared.check_completion_file_exists(args, last_entry)
246252
github_intro(args)
247253
plot_totals_by_license_type(args)
248254
plot_totals_by_restriction(args)

scripts/3-report/wikipedia_report.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def parse_arguments():
3434
"""
3535
Parses command-line arguments, returns parsed arguments.
3636
"""
37+
global QUARTER
3738
LOGGER.info("Parsing command-line arguments")
3839
parser = argparse.ArgumentParser(description=__doc__)
3940
parser.add_argument(
@@ -57,12 +58,18 @@ def parse_arguments():
5758
help="Enable git actions such as fetch, merge, add, commit, and push"
5859
" (default: False)",
5960
)
61+
parser.add_argument(
62+
"--force",
63+
action="store_true",
64+
help="Regenerate data even if report files exist",
65+
)
6066
args = parser.parse_args()
6167
if not args.enable_save and args.enable_git:
6268
parser.error("--enable-git requires --enable-save")
6369
if args.quarter != QUARTER:
6470
global PATHS
6571
PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
72+
QUARTER = args.quarter
6673
args.logger = LOGGER
6774
args.paths = PATHS
6875
return args
@@ -261,6 +268,10 @@ def main():
261268
args = parse_arguments()
262269
shared.paths_log(LOGGER, PATHS)
263270
shared.git_fetch_and_merge(args, PATHS["repo"])
271+
last_entry = shared.path_join(
272+
PATHS["data_phase"], "wikipedia_least_language_usage.png"
273+
)
274+
shared.check_completion_file_exists(args, last_entry)
264275
wikipedia_intro(args)
265276
plot_language_representation(args)
266277
plot_highest_language_usage(args)

scripts/shared.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Standard library
2+
import csv
23
import logging
34
import os
45
import sys
@@ -36,13 +37,31 @@ def __init__(self, message, exit_code=None):
3637
super().__init__(self.message)
3738

3839

39-
def check_for_data_files(args, file_paths, QUARTER):
40+
def data_to_csv(args, data, file_path):
41+
if not args.enable_save:
42+
return
43+
os.makedirs(args.paths["data_phase"], exist_ok=True)
44+
# emulate csv.unix_dialect
45+
data.to_csv(
46+
file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
47+
)
48+
49+
50+
def check_completion_file_exists(args, file_paths):
51+
""" "
52+
This function checks if expected output files
53+
exist. If any exist and --force is not provided,
54+
the script exits early by raising a QuantifyingException.
55+
In the case of a report file, we check if the last output exists.
56+
"""
4057
if args.force:
4158
return
59+
if isinstance(file_paths, str):
60+
file_paths = [file_paths]
4261
for path in file_paths:
4362
if os.path.exists(path):
4463
raise QuantifyingException(
45-
f"Processed data already exists for {QUARTER}", 0
64+
f"Output files already exists for {args.quarter}", 0
4665
)
4766

4867

@@ -326,12 +345,12 @@ def update_readme(
326345
readme_path = path_join(paths["data"], args.quarter, "README.md")
327346

328347
# Define section markers for each data source
329-
section_start_line = f"<!-- section start {section_file} -->\n"
330-
section_end_line = f"<!-- section end {section_file} -->\n"
348+
section_start_line = f"<!-- SECTION start {section_file} -->\n"
349+
section_end_line = f"<!-- SECTION end {section_file} -->\n"
331350

332351
# Define entry markers for each plot (optional) and description
333-
entry_start_line = f"<!-- entry start {entry_title} -->\n"
334-
entry_end_line = f"<!-- entry end {entry_title} -->\n"
352+
entry_start_line = f"<!-- {section_file} entry start {entry_title} -->\n"
353+
entry_end_line = f"<!-- {section_file} entry end {entry_title} -->\n"
335354

336355
if os.path.exists(readme_path):
337356
with open(readme_path, "r", encoding="utf-8") as f:
@@ -356,7 +375,7 @@ def update_readme(
356375
sections_before = ordered_sections[:current_postion]
357376
# we find the last existing section that comes before this section
358377
for prev_section_title in reversed(sections_before):
359-
prev_end_line = f"<!-- section end {prev_section_title} -->\n"
378+
prev_end_line = f"<!-- SECTION end {prev_section_title} -->\n"
360379
if prev_end_line in lines:
361380
insert_index = lines.index(prev_end_line) + 1
362381
break

0 commit comments

Comments
 (0)