Skip to content

Commit 121c971

Browse files
authored
Merge pull request #284 from creativecommons/upgrade_csv
Consolidate and clean-up Fetch CSV file interactions
2 parents e1f49e7 + a0394da commit 121c971

File tree

10 files changed

+64
-169
lines changed

10 files changed

+64
-169
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 8 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -125,31 +125,6 @@ def parse_arguments():
125125
return args
126126

127127

128-
def initialize_data_file(file_path, headers):
129-
"""Initialize CSV file with headers if it doesn't exist."""
130-
if not os.path.isfile(file_path):
131-
with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
132-
writer = csv.DictWriter(
133-
file_obj, fieldnames=headers, dialect="unix"
134-
)
135-
writer.writeheader()
136-
137-
138-
def initialize_all_data_files(args):
139-
"""Initialize all data files used by this script.
140-
141-
Creates the data directory and initializes empty CSVs with headers.
142-
"""
143-
if not args.enable_save:
144-
return
145-
146-
os.makedirs(PATHS["data_1-fetch"], exist_ok=True)
147-
initialize_data_file(FILE_ARXIV_COUNT, HEADER_COUNT)
148-
initialize_data_file(FILE_ARXIV_CATEGORY_REPORT, HEADER_CATEGORY_REPORT)
149-
initialize_data_file(FILE_ARXIV_YEAR, HEADER_YEAR)
150-
initialize_data_file(FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET)
151-
152-
153128
def get_identifier_mapping():
154129
global IDENTIER_MAPPING
155130
LOGGER.info("Loading CC Legal Tool metadata for CC identifer mapping")
@@ -472,19 +447,6 @@ def query_arxiv(args, session):
472447
return data, cc_articles_found
473448

474449

475-
def rows_to_csv(args, fieldnames, rows, file_path):
476-
if not args.enable_save:
477-
return args
478-
479-
with open(file_path, "w", encoding="utf-8", newline="\n") as file_handle:
480-
writer = csv.DictWriter(
481-
file_handle, fieldnames=fieldnames, dialect="unix"
482-
)
483-
writer.writeheader()
484-
for row in rows:
485-
writer.writerow(row)
486-
487-
488450
def write_data(args, data):
489451
"""
490452
Write fetched data to CSV files.
@@ -508,7 +470,9 @@ def write_data(args, data):
508470
}
509471
)
510472
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "AUTHOR_BUCKET"))
511-
rows_to_csv(args, HEADER_AUTHOR_BUCKET, rows, FILE_ARXIV_AUTHOR_BUCKET)
473+
shared.rows_to_csv(
474+
args, FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET, rows
475+
)
512476

513477
# Save category report
514478
# fetched_data["category_counts"]: {identifier: {category_code: count}}
@@ -527,15 +491,17 @@ def write_data(args, data):
527491
}
528492
)
529493
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "CATEGORY_CODE"))
530-
rows_to_csv(args, HEADER_CATEGORY_REPORT, rows, FILE_ARXIV_CATEGORY_REPORT)
494+
shared.rows_to_csv(
495+
args, FILE_ARXIV_CATEGORY_REPORT, HEADER_CATEGORY_REPORT, rows
496+
)
531497

532498
# Save tool counts report
533499
# fetched_data["tool_counts"]: {identifier: count}
534500
rows = []
535501
for identifier, count in data["tool_counts"].items():
536502
rows.append({"TOOL_IDENTIFIER": identifier, "COUNT": count})
537503
rows.sort(key=itemgetter("TOOL_IDENTIFIER"))
538-
rows_to_csv(args, HEADER_COUNT, rows, FILE_ARXIV_COUNT)
504+
shared.rows_to_csv(args, FILE_ARXIV_COUNT, HEADER_COUNT, rows)
539505

540506
# Save year count report
541507
# fetched_data["year_counts"]: {identifier: {year: count}}
@@ -546,7 +512,7 @@ def write_data(args, data):
546512
{"TOOL_IDENTIFIER": identifier, "YEAR": year, "COUNT": count}
547513
)
548514
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "YEAR"))
549-
rows_to_csv(args, HEADER_YEAR, rows, FILE_ARXIV_YEAR)
515+
shared.rows_to_csv(args, FILE_ARXIV_YEAR, HEADER_YEAR, rows)
550516

551517

552518
def write_provence(args, cc_articles_found):
@@ -584,7 +550,6 @@ def main():
584550
args = parse_arguments()
585551
shared.paths_log(LOGGER, PATHS)
586552
shared.git_fetch_and_merge(args, PATHS["repo"])
587-
initialize_all_data_files(args)
588553
get_identifier_mapping()
589554
session = shared.get_session()
590555
query_category_mapping(args, session)

scripts/1-fetch/gcs_fetch.py

Lines changed: 8 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -99,25 +99,14 @@ def get_search_service():
9999
)
100100

101101

102-
def initialize_data_file(file_path, header):
103-
if not os.path.isfile(file_path):
104-
with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
105-
writer = csv.DictWriter(
106-
file_obj, fieldnames=header, dialect="unix"
107-
)
108-
writer.writeheader()
109-
110-
111102
def initialize_all_data_files(args):
112-
if not args.enable_save:
113-
return
114-
115-
# Create data directory for this phase
116-
os.makedirs(PATHS["data_phase"], exist_ok=True)
117-
118-
initialize_data_file(FILE1_COUNT, HEADER1_COUNT)
119-
initialize_data_file(FILE2_LANGUAGE, HEADER2_LANGUAGE)
120-
initialize_data_file(FILE3_COUNTRY, HEADER3_COUNTRY)
103+
for file_path, header in [
104+
(FILE1_COUNT, HEADER1_COUNT),
105+
(FILE2_LANGUAGE, HEADER2_LANGUAGE),
106+
(FILE3_COUNTRY, HEADER3_COUNTRY),
107+
]:
108+
if not os.path.isfile(file_path):
109+
shared.rows_to_csv(args, file_path, header, [])
121110

122111

123112
def get_last_completed_plan_index():
@@ -150,8 +139,6 @@ def load_plan():
150139

151140

152141
def append_data(args, plan_row, index, count):
153-
if not args.enable_save:
154-
return
155142
if plan_row["COUNTRY"]:
156143
file_path = FILE3_COUNTRY
157144
fieldnames = HEADER3_COUNTRY
@@ -178,11 +165,7 @@ def append_data(args, plan_row, index, count):
178165
"TOOL_IDENTIFIER": plan_row["TOOL_IDENTIFIER"],
179166
"COUNT": count,
180167
}
181-
with open(file_path, "a", encoding="utf-8", newline="\n") as file_obj:
182-
writer = csv.DictWriter(
183-
file_obj, fieldnames=fieldnames, dialect="unix"
184-
)
185-
writer.writerow(row)
168+
shared.rows_to_csv(args, file_path, fieldnames, [row], append=True)
186169

187170

188171
def query_gcs(args, service, last_completed_plan_index, plan):

scripts/1-fetch/github_fetch.py

Lines changed: 7 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
LOGGER, PATHS = shared.setup(__file__)
2929

3030
# Constants
31-
FILE1_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
31+
FILE_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
3232
GH_TOKEN = os.getenv("GH_TOKEN")
3333
# Also see: https://en.wikipedia.org/wiki/Public-domain-equivalent_license
3434
GITHUB_TOOLS = [
@@ -40,7 +40,7 @@
4040
{"TOOL_IDENTIFIER": "Unlicense", "SPDX_IDENTIFIER": "Unlicense"},
4141
{"TOOL_IDENTIFIER": "Total public repositories", "SPDX_IDENTIFIER": "N/A"},
4242
]
43-
HEADER1_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
43+
HEADER_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
4444
QUARTER = os.path.basename(PATHS["data_quarter"])
4545

4646

@@ -68,7 +68,7 @@ def parse_arguments():
6868

6969
def check_for_completion():
7070
try:
71-
with open(FILE1_COUNT, "r", newline="") as file_obj:
71+
with open(FILE_COUNT, "r", encoding="utf-8") as file_obj:
7272
reader = csv.DictReader(file_obj, dialect="unix")
7373
if len(list(reader)) == len(GITHUB_TOOLS):
7474
raise shared.QuantifyingException(
@@ -78,27 +78,6 @@ def check_for_completion():
7878
pass # File may not be found without --enable-save, etc.
7979

8080

81-
def write_data(args, tool_data):
82-
if not args.enable_save:
83-
return args
84-
85-
# Create data directory for this phase
86-
os.makedirs(PATHS["data_phase"], exist_ok=True)
87-
88-
if len(tool_data) < len(GITHUB_TOOLS):
89-
LOGGER.error("Unable to fetch all records. Aborting.")
90-
return args
91-
92-
with open(FILE1_COUNT, "w", encoding="utf-8", newline="\n") as file_obj:
93-
writer = csv.DictWriter(
94-
file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
95-
)
96-
writer.writeheader()
97-
for row in tool_data:
98-
writer.writerow(row)
99-
return args
100-
101-
10281
def query_github(args, session):
10382
tool_data = []
10483
for tool in GITHUB_TOOLS:
@@ -148,7 +127,10 @@ def main():
148127
session.headers.update({"authorization": f"Bearer {GH_TOKEN}"})
149128

150129
tool_data = query_github(args, session)
151-
args = write_data(args, tool_data)
130+
if len(tool_data) < len(GITHUB_TOOLS):
131+
LOGGER.error("Unable to fetch all records. Aborting.")
132+
return args
133+
shared.rows_to_csv(args, FILE_COUNT, HEADER_COUNT, tool_data)
152134
args = shared.git_add_and_commit(
153135
args,
154136
PATHS["repo"],

scripts/1-fetch/openverse_fetch.py

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313

1414
# Standard library
1515
import argparse
16-
import csv
1716
import os
1817
import sys
1918
import textwrap
@@ -192,27 +191,12 @@ def query_openverse(session):
192191
return aggregate
193192

194193

195-
def write_data(args, data):
196-
if not args.enable_save:
197-
return
198-
os.makedirs(PATHS["data_phase"], exist_ok=True)
199-
with open(FILE_PATH, "w", encoding="utf-8", newline="") as file_obj:
200-
writer = csv.DictWriter(
201-
file_obj,
202-
fieldnames=OPENVERSE_FIELDS,
203-
dialect="unix",
204-
)
205-
writer.writeheader()
206-
for row in data:
207-
writer.writerow(row)
208-
209-
210194
def main():
211195
args = parse_arguments()
212196
LOGGER.info("Starting Openverse Fetch Script...")
213197
session = shared.get_session(accept_header="application/json")
214198
records = query_openverse(session)
215-
write_data(args, records)
199+
shared.rows_to_csv(args, FILE_PATH, OPENVERSE_FIELDS, records)
216200
LOGGER.info(f"Fetched {len(records)} unique Openverse records.")
217201

218202

scripts/1-fetch/smithsonian_fetch.py

Lines changed: 4 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -74,15 +74,15 @@ def check_for_completion():
7474
completed_units = False
7575

7676
try:
77-
with open(FILE_1_METRICS, "r", newline="") as file_obj:
77+
with open(FILE_1_METRICS, "r", encoding="utf-8") as file_obj:
7878
reader = csv.DictReader(file_obj, dialect="unix")
7979
if len(list(reader)) > 0:
8080
completed_metrics = True
8181
except FileNotFoundError:
8282
pass # File may not be found without --enable-save, etc.
8383

8484
try:
85-
with open(FILE_2_UNITS, "r", newline="") as file_obj:
85+
with open(FILE_2_UNITS, "r", encoding="utf-8") as file_obj:
8686
reader = csv.DictReader(file_obj, dialect="unix")
8787
if len(list(reader)) > 30:
8888
completed_units = True
@@ -95,32 +95,6 @@ def check_for_completion():
9595
)
9696

9797

98-
def write_data(args, data_metrics, data_units):
99-
if not args.enable_save:
100-
return args
101-
102-
# Create data directory for this phase
103-
os.makedirs(PATHS["data_phase"], exist_ok=True)
104-
105-
with open(FILE_1_METRICS, "w", encoding="utf-8", newline="\n") as file_obj:
106-
writer = csv.DictWriter(
107-
file_obj, fieldnames=HEADER_1_METRICS, dialect="unix"
108-
)
109-
writer.writeheader()
110-
for row in data_metrics:
111-
writer.writerow(row)
112-
113-
with open(FILE_2_UNITS, "w", encoding="utf-8", newline="\n") as file_obj:
114-
writer = csv.DictWriter(
115-
file_obj, fieldnames=HEADER_2_UNITS, dialect="unix"
116-
)
117-
writer.writeheader()
118-
for row in data_units:
119-
writer.writerow(row)
120-
121-
return args
122-
123-
12498
def query_smithsonian(args, session):
12599
if not DATA_GOV_API_KEY:
126100
raise shared.QuantifyingException(
@@ -177,7 +151,8 @@ def main():
177151
check_for_completion()
178152
session = shared.get_session()
179153
data_metrics, data_units = query_smithsonian(args, session)
180-
args = write_data(args, data_metrics, data_units)
154+
shared.rows_to_csv(args, FILE_1_METRICS, HEADER_1_METRICS, data_metrics)
155+
shared.rows_to_csv(args, FILE_2_UNITS, HEADER_2_UNITS, data_units)
181156
args = shared.git_add_and_commit(
182157
args,
183158
PATHS["repo"],

scripts/1-fetch/wikipedia_fetch.py

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,9 @@ def parse_arguments():
6565

6666
def check_for_completion():
6767
try:
68-
with open(FILE_LANGUAGES, "r", newline="") as file_obj:
68+
with open(
69+
FILE_LANGUAGES, "r", encoding="utf-8", newline=""
70+
) as file_obj:
6971
reader = csv.DictReader(file_obj, dialect="unix")
7072
if len(list(reader)) > 300:
7173
raise shared.QuantifyingException(
@@ -75,22 +77,6 @@ def check_for_completion():
7577
pass # File may not be found without --enable-save, etc.
7678

7779

78-
def write_data(args, tool_data):
79-
if not args.enable_save:
80-
return args
81-
LOGGER.info("Saving fetched data")
82-
os.makedirs(PATHS["data_phase"], exist_ok=True)
83-
84-
with open(FILE_LANGUAGES, "w", encoding="utf-8", newline="\n") as file_obj:
85-
writer = csv.DictWriter(
86-
file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
87-
)
88-
writer.writeheader()
89-
for row in tool_data:
90-
writer.writerow(row)
91-
return args
92-
93-
9480
def query_wikipedia_languages(session):
9581
LOGGER.info("Fetching article counts from all language Wikipedias")
9682
tool_data = []
@@ -173,7 +159,7 @@ def main():
173159
shared.git_fetch_and_merge(args, PATHS["repo"])
174160
session = shared.get_session()
175161
tool_data = query_wikipedia_languages(session)
176-
args = write_data(args, tool_data)
162+
shared.rows_to_csv(args, FILE_LANGUAGES, HEADER_LANGUAGES, tool_data)
177163
args = shared.git_add_and_commit(
178164
args,
179165
PATHS["repo"],

0 commit comments

Comments
 (0)