Skip to content

Commit d98681f

Browse files
committed
update gcs query plan to use cc-legal-tools.csv
1 parent 5754535 commit d98681f

File tree

2 files changed

+63
-100
lines changed

2 files changed

+63
-100
lines changed

data/gcs_query_plan.csv

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,11 @@
1919
"//creativecommons.org/licenses/by-nc-sa/2.5","CC BY-NC-SA 2.5","","","",""
2020
"//creativecommons.org/licenses/by-nd/2.5","CC BY-ND 2.5","","","",""
2121
"//creativecommons.org/licenses/by-sa/2.5","CC BY-SA 2.5","","","",""
22-
"//creativecommons.org/licenses/by/2.1","CC BY 2.1","","","",""
23-
"//creativecommons.org/licenses/by-nc/2.1","CC BY-NC 2.1","","","",""
24-
"//creativecommons.org/licenses/by-nc-nd/2.1","CC BY-NC-ND 2.1","","","",""
25-
"//creativecommons.org/licenses/by-nc-sa/2.1","CC BY-NC-SA 2.1","","","",""
26-
"//creativecommons.org/licenses/by-nd/2.1","CC BY-ND 2.1","","","",""
27-
"//creativecommons.org/licenses/by-sa/2.1","CC BY-SA 2.1","","","",""
2822
"//creativecommons.org/licenses/by/2.0","CC BY 2.0","","","",""
2923
"//creativecommons.org/licenses/by-nc/2.0","CC BY-NC 2.0","","","",""
3024
"//creativecommons.org/licenses/by-nc-nd/2.0","CC BY-NC-ND 2.0","","","",""
3125
"//creativecommons.org/licenses/by-nc-sa/2.0","CC BY-NC-SA 2.0","","","",""
3226
"//creativecommons.org/licenses/by-nd/2.0","CC BY-ND 2.0","","","",""
33-
"//creativecommons.org/licenses/by-nd-nc/2.0","CC BY-ND-NC 2.0","","","",""
3427
"//creativecommons.org/licenses/by-sa/2.0","CC BY-SA 2.0","","","",""
3528
"//creativecommons.org/licenses/by/1.0","CC BY 1.0","","","",""
3629
"//creativecommons.org/licenses/by-nc/1.0","CC BY-NC 1.0","","","",""
@@ -559,7 +552,6 @@
559552
"//creativecommons.org/licenses/by-nc-nd/2.0/jp","CC BY-NC-ND 2.0 JP","","","",""
560553
"//creativecommons.org/licenses/by-nc-sa/2.0/jp","CC BY-NC-SA 2.0 JP","","","",""
561554
"//creativecommons.org/licenses/by-nd/2.0/jp","CC BY-ND 2.0 JP","","","",""
562-
"//creativecommons.org/licenses/by-nd-nc/2.0/jp","CC BY-ND-NC 2.0 JP","","","",""
563555
"//creativecommons.org/licenses/by-sa/2.0/jp","CC BY-SA 2.0 JP","","","",""
564556
"//creativecommons.org/licenses/by/2.0/kr","CC BY 2.0 KR","","","",""
565557
"//creativecommons.org/licenses/by-nc/2.0/kr","CC BY-NC 2.0 KR","","","",""
@@ -616,11 +608,6 @@
616608
"//creativecommons.org/licenses/by-nd-nc/1.0/nl","CC BY-ND-NC 1.0 NL","","","",""
617609
"//creativecommons.org/licenses/by-sa/1.0/nl","CC BY-SA 1.0 NL","","","",""
618610
"//creativecommons.org/licenses/devnations/2.0","CC DEVNATIONS 2.0","","","",""
619-
"//creativecommons.org/licenses/nc/2.0","CC NC 2.0","","","",""
620-
"//creativecommons.org/licenses/nc-sa/2.0","CC NC-SA 2.0","","","",""
621-
"//creativecommons.org/licenses/nd/2.0","CC ND 2.0","","","",""
622-
"//creativecommons.org/licenses/nd-nc/2.0","CC ND-NC 2.0","","","",""
623-
"//creativecommons.org/licenses/sa/2.0","CC SA 2.0","","","",""
624611
"//creativecommons.org/licenses/nc/1.0","CC NC 1.0","","","",""
625612
"//creativecommons.org/licenses/nc-sa/1.0","CC NC-SA 1.0","","","",""
626613
"//creativecommons.org/licenses/nc-sampling+/1.0","CC NC-SAMPLING+ 1.0","","","",""
@@ -650,7 +637,8 @@
650637
"//creativecommons.org/licenses/nc-sampling+/1.0/tw","CC NC-SAMPLING+ 1.0 TW","","","",""
651638
"//creativecommons.org/licenses/sampling/1.0/tw","CC SAMPLING 1.0 TW","","","",""
652639
"//creativecommons.org/licenses/sampling+/1.0/tw","CC SAMPLING+ 1.0 TW","","","",""
653-
"//creativecommons.org/licenses/publicdomain","CC PUBLICDOMAIN","","","",""
640+
"//creativecommons.org/publicdomain/certification/1.0/us","CERTIFICATION 1.0 US","","","",""
641+
"//creativecommons.org/licenses/publicdomain","CERTIFICATION 1.0 US","","","",""
654642
"//creativecommons.org/licenses/by/4.0","CC BY 4.0","","","Arabic","lang_ar"
655643
"//creativecommons.org/licenses/by/4.0","CC BY 4.0","","","Bulgarian","lang_bg"
656644
"//creativecommons.org/licenses/by/4.0","CC BY 4.0","","","Catalan","lang_ca"

dev/create_gcs_query_plan.py

Lines changed: 61 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import sys
66
import textwrap
77
import traceback
8+
from copy import deepcopy
89
from types import SimpleNamespace
910

1011
# Third-party
@@ -26,48 +27,12 @@
2627
LOGGER.info("Script execution started")
2728

2829

29-
def assign_tool_parts(url):
30-
tool = SimpleNamespace()
31-
tool.url = url
32-
dirs = url.strip().split("/")[3:]
33-
tool.category = dirs[0]
34-
tool.unit = dirs[1]
35-
if len(dirs) > 2:
36-
tool.version = dirs[2]
37-
tool.ver_sort = 999 - int(float(dirs[2]) * 10)
38-
else:
39-
tool.version = None
40-
tool.ver_sort = 999
41-
if len(dirs) == 4:
42-
tool.jurisdiction = dirs[3]
43-
tool.jur_sort = dirs[3]
44-
else:
45-
tool.jurisdiction = None
30+
def sort_tools(tool):
31+
tool.ver_sort = 999 - int(float(tool.version) * 10)
32+
if tool.jurisdiction == "":
4633
tool.jur_sort = "A"
47-
48-
# Identifier code is based on CC Legal Tools application:
49-
# https://github.com/creativecommons/cc-legal-tools-app/blob/c3ac573871c7e20517539851de16998307f20d78/legal_tools/models.py#L677-L694
50-
if tool.version is not None:
51-
tool.identifier = f"{tool.unit} {tool.version}"
5234
else:
53-
tool.identifier = f"{tool.unit}"
54-
55-
if tool.unit == "mark":
56-
tool.identifier = f"PDM {tool.version}"
57-
elif tool.unit == "zero":
58-
tool.identifier = f"CC0 {tool.version}"
59-
elif tool.category == "licenses":
60-
tool.identifier = f"CC {tool.identifier}"
61-
62-
if tool.jurisdiction:
63-
tool.identifier = f"{tool.identifier} {tool.jurisdiction}"
64-
tool.identifier = tool.identifier.upper()
65-
66-
return tool
67-
68-
69-
def sort_tools(url):
70-
tool = assign_tool_parts(url)
35+
tool.jur_sort = tool.jurisdiction
7136
# Priority 1: 4.0 licenses
7237
if tool.category == "licenses" and tool.version == "4.0":
7338
priority = 1
@@ -78,32 +43,32 @@ def sort_tools(url):
7843
elif (
7944
tool.category == "licenses"
8045
and tool.unit.startswith("by")
81-
and tool.version is not None
82-
and tool.jurisdiction is None
46+
and tool.version != ""
47+
and tool.jurisdiction == ""
8348
):
8449
priority = 3
8550
# Priority 4: ported 1.0-3.0 by* licenses
8651
elif (
8752
tool.category == "licenses"
8853
and tool.unit.startswith("by")
89-
and tool.version is not None
90-
and tool.jurisdiction is not None
54+
and tool.version != ""
55+
and tool.jurisdiction != ""
9156
):
9257
priority = 4
9358
# Priority 5: unported 1.0-3.0 non-by* licenses
9459
elif (
9560
tool.category == "licenses"
9661
and not tool.unit.startswith("by")
97-
and tool.version is not None
98-
and tool.jurisdiction is None
62+
and tool.version != ""
63+
and tool.jurisdiction == ""
9964
):
10065
priority = 5
10166
# Priority 6: ported 1.0-3.0 non-by* licenses
10267
elif (
10368
tool.category == "licenses"
10469
and not tool.unit.startswith("by")
105-
and tool.version is not None
106-
and tool.jurisdiction is not None
70+
and tool.version != ""
71+
and tool.jurisdiction != ""
10772
):
10873
priority = 6
10974
# Priority 7: miscellaneous
@@ -112,17 +77,29 @@ def sort_tools(url):
11277
return f"{priority}-{tool.ver_sort}-{tool.jur_sort}-{tool.unit}"
11378

11479

115-
def get_tool_urls():
116-
LOGGER.info("Loading CC Legal Tool paths and adding prefix")
117-
file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
118-
prefix = "//creativecommons.org/"
119-
tool_urls = []
80+
def get_tools_metadata_namespace():
81+
LOGGER.info("Loading CC Legal Tool metadata")
82+
file_path = shared.path_join(PATHS["data"], "cc-legal-tools.csv")
83+
tools_metadata = []
12084
with open(file_path, "r", encoding="utf-8") as file_obj:
121-
for line in file_obj:
122-
tool_urls.append(f"{prefix}{line.strip()}")
123-
LOGGER.info("Prioritizing CC Legal Tool URLs")
124-
tool_urls.sort(key=sort_tools)
125-
return tool_urls
85+
rows = csv.DictReader(file_obj, dialect="unix")
86+
for row in rows:
87+
tool = SimpleNamespace()
88+
for key, value in row.items():
89+
setattr(tool, key.lower(), value)
90+
tool.canonical_url = tool.canonical_url.replace("https:", "")
91+
tool.canonical_url = tool.canonical_url.rstrip("/")
92+
tools_metadata.append(tool)
93+
# Add tool with legacy URL for CERTIFICATION 1.0 US
94+
if tool.identifier == "CERTIFICATION 1.0 US":
95+
legacy_tool = deepcopy(tool)
96+
legacy_tool.canonical_url = (
97+
"//creativecommons.org/licenses/publicdomain"
98+
)
99+
tools_metadata.append(legacy_tool)
100+
LOGGER.info("Prioritizing CC Legal Tool metadata entries")
101+
tools_metadata.sort(key=sort_tools)
102+
return tools_metadata
126103

127104

128105
def load_countries():
@@ -139,61 +116,59 @@ def load_languages():
139116
return languages
140117

141118

142-
def create_query_plan(tool_urls, countries, languages):
143-
tool_data = {}
144-
for url in tool_urls:
145-
tool = assign_tool_parts(url)
146-
tool_data[tool.identifier] = tool
147-
119+
def create_query_plan(tools_metadata, countries, languages):
148120
plan = []
149-
150121
# ideal: all tools (640), all countries (242), all languages (35): 5,420,800
151122

152123
# cr: Google Country Collection value
153124
# lr: Google Language Collection value
154125

155-
# Group 1: All tools without cr or lr
156-
# subtotal: 652
157-
for identifier, tool in tool_data.items():
158-
plan.append({"TOOL_URL": tool.url, "TOOL_IDENTIFIER": identifier})
126+
# Group 1: All tools (without country or language) is 640
127+
for tool in tools_metadata:
128+
plan.append(
129+
{
130+
"TOOL_URL": tool.canonical_url,
131+
"TOOL_IDENTIFIER": tool.identifier,
132+
}
133+
)
159134

160-
# Group 2: 4.0 licenses (6) by language (35)
161-
# CC0 (1) by language (35)
162-
# PDM (1) by language (35)
163-
# subtotal: 280
164-
for identifier, tool in tool_data.items():
135+
# Group 2: 4.0 licenses (6) by language (35) is 210
136+
# CC0 (1) by language (35) .........is 35
137+
# PDM (1) by language (35) .........is 35
138+
# ......................... ..subtotal 280
139+
for tool in tools_metadata:
165140
if (
166141
tool.category == "licenses" and tool.version == "4.0"
167142
) or tool.unit in ("mark", "zero"):
168143
for pair in languages:
169144
plan.append(
170145
{
171-
"TOOL_URL": tool.url,
172-
"TOOL_IDENTIFIER": identifier,
146+
"TOOL_URL": tool.canonical_url,
147+
"TOOL_IDENTIFIER": tool.identifier,
173148
"LANGUAGE": pair["language"],
174149
"LR": pair["lr"],
175150
}
176151
)
177152

178-
# Group 3: 4.0 licenses (6) by country (242)
179-
# CC0 (1) by country (242)
180-
# PDM (1) by country (242)
181-
# subtotal: 1,936
182-
for identifier, tool in tool_data.items():
153+
# Group 3: 4.0 licenses (6) by country (242) is 1,452
154+
# CC0 (1) by country (242)..........is 242
155+
# PDM (1) by country (242)..........is 242
156+
# ............................subtotal 1,936
157+
for tool in tools_metadata:
183158
if (
184159
tool.category == "licenses" and tool.version == "4.0"
185160
) or tool.unit in ("mark", "zero"):
186161
for pair in countries:
187162
plan.append(
188163
{
189-
"TOOL_URL": tool.url,
190-
"TOOL_IDENTIFIER": identifier,
164+
"TOOL_URL": tool.canonical_url,
165+
"TOOL_IDENTIFIER": tool.identifier,
191166
"COUNTRY": pair["country"],
192167
"CR": pair["cr"],
193168
}
194169
)
195170

196-
# plan total: 2,868
171+
# plan total: 2,856
197172
LOGGER.info(f"Plan entries: {len(plan)}")
198173
return plan
199174

@@ -219,10 +194,10 @@ def save_plan(plan):
219194

220195

221196
def main():
222-
tool_urls = get_tool_urls()
197+
tools_metadata = get_tools_metadata_namespace()
223198
countries = load_countries()
224199
languages = load_languages()
225-
plan = create_query_plan(tool_urls, countries, languages)
200+
plan = create_query_plan(tools_metadata, countries, languages)
226201
save_plan(plan)
227202

228203

0 commit comments

Comments
 (0)