55import sys
66import textwrap
77import traceback
8+ from copy import deepcopy
89from types import SimpleNamespace
910
1011# Third-party
2627LOGGER .info ("Script execution started" )
2728
2829
29- def assign_tool_parts (url ):
30- tool = SimpleNamespace ()
31- tool .url = url
32- dirs = url .strip ().split ("/" )[3 :]
33- tool .category = dirs [0 ]
34- tool .unit = dirs [1 ]
35- if len (dirs ) > 2 :
36- tool .version = dirs [2 ]
37- tool .ver_sort = 999 - int (float (dirs [2 ]) * 10 )
38- else :
39- tool .version = None
40- tool .ver_sort = 999
41- if len (dirs ) == 4 :
42- tool .jurisdiction = dirs [3 ]
43- tool .jur_sort = dirs [3 ]
44- else :
45- tool .jurisdiction = None
30+ def sort_tools (tool ):
31+ tool .ver_sort = 999 - int (float (tool .version ) * 10 )
32+ if tool .jurisdiction == "" :
4633 tool .jur_sort = "A"
47-
48- # Identifier code is based on CC Legal Tools application:
49- # https://github.com/creativecommons/cc-legal-tools-app/blob/c3ac573871c7e20517539851de16998307f20d78/legal_tools/models.py#L677-L694
50- if tool .version is not None :
51- tool .identifier = f"{ tool .unit } { tool .version } "
5234 else :
53- tool .identifier = f"{ tool .unit } "
54-
55- if tool .unit == "mark" :
56- tool .identifier = f"PDM { tool .version } "
57- elif tool .unit == "zero" :
58- tool .identifier = f"CC0 { tool .version } "
59- elif tool .category == "licenses" :
60- tool .identifier = f"CC { tool .identifier } "
61-
62- if tool .jurisdiction :
63- tool .identifier = f"{ tool .identifier } { tool .jurisdiction } "
64- tool .identifier = tool .identifier .upper ()
65-
66- return tool
67-
68-
69- def sort_tools (url ):
70- tool = assign_tool_parts (url )
35+ tool .jur_sort = tool .jurisdiction
7136 # Priority 1: 4.0 licenses
7237 if tool .category == "licenses" and tool .version == "4.0" :
7338 priority = 1
@@ -78,32 +43,32 @@ def sort_tools(url):
7843 elif (
7944 tool .category == "licenses"
8045 and tool .unit .startswith ("by" )
81- and tool .version is not None
82- and tool .jurisdiction is None
46+ and tool .version != ""
47+ and tool .jurisdiction == ""
8348 ):
8449 priority = 3
8550 # Priority 4: ported 1.0-3.0 by* licenses
8651 elif (
8752 tool .category == "licenses"
8853 and tool .unit .startswith ("by" )
89- and tool .version is not None
90- and tool .jurisdiction is not None
54+ and tool .version != ""
55+ and tool .jurisdiction != ""
9156 ):
9257 priority = 4
9358 # Priority 5: unported 1.0-3.0 non-by* licenses
9459 elif (
9560 tool .category == "licenses"
9661 and not tool .unit .startswith ("by" )
97- and tool .version is not None
98- and tool .jurisdiction is None
62+ and tool .version != ""
63+ and tool .jurisdiction == ""
9964 ):
10065 priority = 5
10166 # Priority 6: ported 1.0-3.0 non-by* licenses
10267 elif (
10368 tool .category == "licenses"
10469 and not tool .unit .startswith ("by" )
105- and tool .version is not None
106- and tool .jurisdiction is not None
70+ and tool .version != ""
71+ and tool .jurisdiction != ""
10772 ):
10873 priority = 6
10974 # Priority 7: miscellaneous
@@ -112,17 +77,29 @@ def sort_tools(url):
11277 return f"{ priority } -{ tool .ver_sort } -{ tool .jur_sort } -{ tool .unit } "
11378
11479
def get_tools_metadata_namespace():
    """
    Load the CC Legal Tool metadata CSV and return it as a prioritized list.

    Every row of "cc-legal-tools.csv" (located in the PATHS["data"]
    directory) becomes a SimpleNamespace whose attribute names are the
    lower-cased CSV column headers and whose values are the raw strings
    read from the file.

    Each tool's canonical_url has the "https:" text removed and any trailing
    "/" stripped. The row whose identifier is "CERTIFICATION 1.0 US" is
    followed by a copy that carries its legacy URL, so the returned list
    contains one more entry than the CSV has rows when that row is present.

    Returns:
        list: SimpleNamespace objects ordered by the key that sort_tools()
        returns for each of them.
    """
    LOGGER.info("Loading CC Legal Tool metadata")
    file_path = shared.path_join(PATHS["data"], "cc-legal-tools.csv")
    tools_metadata = []
    # newline="" is what the csv module documentation asks for: it leaves
    # newline handling to the csv reader so that line breaks embedded in
    # quoted fields are parsed correctly on every platform.
    with open(file_path, "r", newline="", encoding="utf-8") as file_obj:
        rows = csv.DictReader(file_obj, dialect="unix")
        for row in rows:
            tool = SimpleNamespace()
            for key, value in row.items():
                setattr(tool, key.lower(), value)
            tool.canonical_url = tool.canonical_url.replace("https:", "")
            tool.canonical_url = tool.canonical_url.rstrip("/")
            tools_metadata.append(tool)
            # Add tool with legacy URL for CERTIFICATION 1.0 US
            if tool.identifier == "CERTIFICATION 1.0 US":
                legacy_tool = deepcopy(tool)
                legacy_tool.canonical_url = (
                    "//creativecommons.org/licenses/publicdomain"
                )
                tools_metadata.append(legacy_tool)
    LOGGER.info("Prioritizing CC Legal Tool metadata entries")
    tools_metadata.sort(key=sort_tools)
    return tools_metadata
126103
127104
128105def load_countries ():
@@ -139,61 +116,59 @@ def load_languages():
139116 return languages
140117
141118
def create_query_plan(tools_metadata, countries, languages):
    """
    Build the list of query plan entries.

    Args:
        tools_metadata: iterable of tool objects exposing the attributes
            canonical_url, identifier, category, version and unit.
        countries: iterable of mappings with "country" and "cr" keys
            (cr: Google Country Collection value).
        languages: iterable of mappings with "language" and "lr" keys
            (lr: Google Language Collection value).

    Returns:
        list: one dictionary per planned query. Entries appear in three
        consecutive groups: every tool on its own, then the selected tools
        combined with every language, then the selected tools combined with
        every country.
    """
    # ideal: all tools, all countries, all languages: 5,522,440

    def is_selected(tool):
        # 4.0 licenses, CC0 ("zero") and PDM ("mark") are the tools that
        # are additionally queried per language and per country
        is_current_license = (
            tool.category == "licenses" and tool.version == "4.0"
        )
        return is_current_license or tool.unit in ("mark", "zero")

    # Group 1: All tools (without country or language) is 640
    plan = [
        {
            "TOOL_URL": tool.canonical_url,
            "TOOL_IDENTIFIER": tool.identifier,
        }
        for tool in tools_metadata
    ]

    selected_tools = [tool for tool in tools_metadata if is_selected(tool)]

    # Group 2: 4.0 licenses (6) by language (35) is 210
    #          CC0 (1) by language (35) is 35
    #          PDM (1) by language (35) is 35
    #          subtotal 280
    plan.extend(
        {
            "TOOL_URL": tool.canonical_url,
            "TOOL_IDENTIFIER": tool.identifier,
            "LANGUAGE": pair["language"],
            "LR": pair["lr"],
        }
        for tool in selected_tools
        for pair in languages
    )

    # Group 3: 4.0 licenses (6) by country (242) is 1,452
    #          CC0 (1) by country (242) is 242
    #          PDM (1) by country (242) is 242
    #          subtotal 1,936
    plan.extend(
        {
            "TOOL_URL": tool.canonical_url,
            "TOOL_IDENTIFIER": tool.identifier,
            "COUNTRY": pair["country"],
            "CR": pair["cr"],
        }
        for tool in selected_tools
        for pair in countries
    )

    # plan total: 2,856
    LOGGER.info(f"Plan entries: {len(plan)}")
    return plan
199174
@@ -219,10 +194,10 @@ def save_plan(plan):
219194
220195
def main():
    """
    Build the query plan from the tool metadata, countries and languages,
    then save it.
    """
    # Arguments are evaluated left to right, so the tool metadata is loaded
    # first, then the countries, then the languages.
    plan = create_query_plan(
        get_tools_metadata_namespace(),
        load_countries(),
        load_languages(),
    )
    save_plan(plan)
227202
228203
0 commit comments