@@ -82,15 +82,15 @@ def wikipedia_intro(args):
8282 "wikipedia_highest_language_usage.csv" ,
8383 )
8484 LOGGER .info (
85- f"Data file (top 10 languages ):"
85+ f"Data file (Most represented language ):"
8686 f"{ file_path_top10 .replace (PATHS ['repo' ], '.' )} "
8787 )
8888 name_label = "LANGUAGE_NAME_EN"
89- name_label_top10 = "language_name_en "
89+ name_label_top10 = "Language "
9090 data = pd .read_csv (file_path , index_col = name_label )
9191 total_articles = data ["COUNT" ].sum ()
9292 top10 = pd .read_csv (file_path_top10 , index_col = name_label_top10 )
93- top10_articles = top10 ["count " ].sum ()
93+ top10_articles = top10 ["Count " ].sum ()
9494 top10_percentage = (top10_articles / total_articles ) * 100
9595 average_articles = total_articles / len (data )
9696 language_count = len (data )
@@ -100,26 +100,71 @@ def wikipedia_intro(args):
100100 "Overview" ,
101101 None ,
102102 None ,
103- "This report provides insights into the usage"
104- " of the Creative Commons sharelike 4.0 across"
105- " the different language edition of Wikipedia."
106- " The wikipedia data, below, uses the `count`"
107- " field from the Wikipedia API to quantify the number of articles"
108- " in each language edition "
109- " of Wikipedia."
110- f"** The total number of Wikipedia articles across"
111- f"** { language_count } languages is"
112- f"** { total_articles :,} . The top 10 languages account for"
113- f"** { top10_articles :,} articles, which is"
114- f"** { top10_percentage :.2f} % of the total articles."
115- f"** The average number of articles per language is"
116- f"** { average_articles :,.2f} .**"
103+ "This report provides insights into the usage of the Creative Commons"
104+ " Attribution 4.0 International across the different language edition"
105+ " of Wikipedia. The wikipedia data, below, uses the `Count` field from"
106+ " the Wikipedia API to quantify the number of articles in"
107+ " each language edition of Wikipedia."
108+ "\n "
109+ f"**The total number of Wikipedia articles across"
110+ f" { language_count } languages is"
111+ f" { total_articles :,} . The top 10 languages account for"
112+ f" { top10_articles :,} articles, which is"
113+ f" { top10_percentage :,.2f} % of the total articles."
114+ f" The average number of articles per language is"
115+ f" { average_articles :,.2f} .**"
117116 "\n "
118117 "Thank you to Wikipedia and the Wikimedia Foundation for"
119118 " making this data publicly available!" ,
120119 )
121120
122121
def plot_language_representation(args):
    """
    Create plots showing language representation
    """
    # NOTE: the docstring above is also the first log message (it is emitted
    # via __doc__ below), so its wording is runtime output. Further
    # documentation is kept in comments instead.
    #
    # Steps: load the processed language-representation CSV, sort it by the
    # "Count" column (ascending), hand it to plot.combined_plot, optionally
    # save the resulting image when args.enable_save is set, and register
    # the image and its description through shared.update_readme.
    LOGGER.info(plot_language_representation.__doc__.strip())

    name_label = "Category"
    data_label = "Count"
    title = "Language Representation"

    file_path = shared.path_join(
        PATHS["data_2-process"],
        "wikipedia_language_representation.csv",
    )
    # Log the path relative to the repository root rather than absolute
    relative_data_path = file_path.replace(PATHS["repo"], ".")
    LOGGER.info(f"data file: {relative_data_path}")

    data = pd.read_csv(file_path, index_col=name_label).sort_values(
        data_label, ascending=True
    )
    plot_handle = plot.combined_plot(
        args=args,
        data=data,
        title=title,
        name_label=name_label,
        data_label=data_label,
    )

    image_path = shared.path_join(
        PATHS["data_phase"], "wikipedia_language_representation.png"
    )
    relative_image_path = image_path.replace(PATHS["repo"], ".")
    LOGGER.info(f"image file: {relative_image_path}")

    if args.enable_save:
        # Create the directory if it does not exist
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        plot_handle.savefig(image_path)

    description = (
        "Plots showing the language representation"
        " across different language editions of Wikipedia."
        " This shows how many languages are underrepresented"
        " (below average number of articles) versus"
        " represented (above average number of articles)."
    )
    shared.update_readme(args, SECTION, title, image_path, description)
166+
167+
123168def plot_highest_language_usage (args ):
124169 """
125170 Create plots showing totals by license type
@@ -130,11 +175,11 @@ def plot_highest_language_usage(args):
130175 "wikipedia_highest_language_usage.csv" ,
131176 )
132177 LOGGER .info (f"data file: { file_path .replace (PATHS ['repo' ], '.' )} " )
133- name_label = "language_name_en "
134- data_label = "count "
178+ name_label = "Language "
179+ data_label = "Count "
135180 data = pd .read_csv (file_path , index_col = name_label )
136181 data .sort_values (data_label , ascending = True , inplace = True )
137- title = "Top 10 Highest Language Usage "
182+ title = "Most represented languages "
138183 plt = plot .combined_plot (
139184 args = args ,
140185 data = data ,
@@ -158,29 +203,29 @@ def plot_highest_language_usage(args):
158203 SECTION ,
159204 title ,
160205 image_path ,
161- "Plots showing the top 10 highest language usage"
206+ "Plots showing the most represented language usage"
162207 " across different language editions of Wikipedia."
163- " This shows which languages have the most articles under CC BY-SA 4.0 "
164- " in Wikipedia, highlighting the distribution of content "
165- " across languages ." ,
208+ " This shows which languages have the most articles"
209+ " under CC BY-SA 4.0 in Wikipedia, highlighting languages with higher "
210+ " language representation ." ,
166211 )
167212
168213
169- def plot_language_representation (args ):
214+ def plot_least_language_usage (args ):
170215 """
171- Create plots showing language representation
216+ Create plots showing totals by license type
172217 """
173- LOGGER .info (plot_language_representation .__doc__ .strip ())
218+ LOGGER .info (plot_least_language_usage .__doc__ .strip ())
174219 file_path = shared .path_join (
175220 PATHS ["data_2-process" ],
176- "wikipedia_language_representation .csv" ,
221+ "wikipedia_least_language_usage .csv" ,
177222 )
178223 LOGGER .info (f"data file: { file_path .replace (PATHS ['repo' ], '.' )} " )
179- name_label = "category "
180- data_label = "language_count "
224+ name_label = "Language "
225+ data_label = "Count "
181226 data = pd .read_csv (file_path , index_col = name_label )
182227 data .sort_values (data_label , ascending = True , inplace = True )
183- title = "Language Representation "
228+ title = "Least represented languages "
184229 plt = plot .combined_plot (
185230 args = args ,
186231 data = data ,
@@ -190,7 +235,7 @@ def plot_language_representation(args):
190235 )
191236
192237 image_path = shared .path_join (
193- PATHS ["data_phase" ], "wikipedia_language_representation .png"
238+ PATHS ["data_phase" ], "wikipedia_least_language_usage .png"
194239 )
195240 LOGGER .info (f"image file: { image_path .replace (PATHS ['repo' ], '.' )} " )
196241
@@ -204,11 +249,11 @@ def plot_language_representation(args):
204249 SECTION ,
205250 title ,
206251 image_path ,
207- "Plots showing the language representation "
252+ "Plots showing the least represented language usage "
208253 " across different language editions of Wikipedia."
209- " This shows how many languages are underrepresented "
210- " (below average number of articles) versus "
211- " represented (above average number of articles) ." ,
254+ " This shows which languages have the least articles "
255+ " under CC BY-SA 4.0 in Wikipedia, highlighting "
256+ " languages with lower content representation ." ,
212257 )
213258
214259
@@ -217,8 +262,9 @@ def main():
217262 shared .paths_log (LOGGER , PATHS )
218263 shared .git_fetch_and_merge (args , PATHS ["repo" ])
219264 wikipedia_intro (args )
220- plot_highest_language_usage (args )
221265 plot_language_representation (args )
266+ plot_highest_language_usage (args )
267+ plot_least_language_usage (args )
222268
223269 # Add and commit changes
224270 args = shared .git_add_and_commit (
0 commit comments