Skip to content

Commit 7cdb200

Browse files
committed
Made changes
1 parent 9c3c1d1 commit 7cdb200

2 files changed

Lines changed: 122 additions & 56 deletions

File tree

scripts/2-process/wikipedia_process.py

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -71,25 +71,48 @@ def data_to_csv(args, data, file_path):
7171

7272
def process_highest_language_usage(args, count_data):
7373
"""
74-
Processing count data: top 10 highest language usage
74+
Processing count data: Most represented languages
7575
"""
7676
LOGGER.info(process_highest_language_usage.__doc__.strip())
7777
data = {}
7878

7979
for row in count_data.itertuples(index=False):
80-
language_name_en = row.LANGUAGE_NAME_EN
81-
count = row.COUNT
82-
data[language_name_en] = count
80+
Language = row.LANGUAGE_NAME_EN
81+
Count = row.COUNT
82+
data[Language] = Count
8383

84-
data = pd.DataFrame(data.items(), columns=["language_name_en", "count"])
85-
data.sort_values("count", ascending=False, inplace=True)
84+
data = pd.DataFrame(data.items(), columns=["Language", "Count"])
85+
data.sort_values("Count", ascending=False, inplace=True)
8686
top_10 = data.head(10)
8787
file_path = shared.path_join(
8888
PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
8989
)
9090
data_to_csv(args, top_10, file_path)
9191

9292

93+
def process_least_language_usage(args, count_data):
    """
    Processing count data: Least represented languages
    """
    # The stripped docstring is what gets written to the log.
    LOGGER.info(process_least_language_usage.__doc__.strip())

    # Map each language name to its count, leaving out rows whose count is
    # below 1. If a language name repeats, the last qualifying row wins.
    counts_by_language = {
        row.LANGUAGE_NAME_EN: row.COUNT
        for row in count_data.itertuples(index=False)
        if row.COUNT >= 1
    }

    ranked = pd.DataFrame(
        counts_by_language.items(), columns=["Language", "Count"]
    ).sort_values("Count", ascending=True)
    # Ascending order puts the smallest counts first, so the leading ten
    # rows are the least represented languages.
    bottom_10 = ranked.head(10)

    file_path = shared.path_join(
        PATHS["data_phase"], "wikipedia_least_language_usage.csv"
    )
    data_to_csv(args, bottom_10, file_path)
114+
115+
93116
def process_language_representation(args, count_data):
94117
"""
95118
Processing count data: language representation
@@ -98,22 +121,18 @@ def process_language_representation(args, count_data):
98121
data = {}
99122

100123
for row in count_data.itertuples(index=False):
101-
language_name_en = row.LANGUAGE_NAME_EN
102-
count = row.COUNT
103-
data[language_name_en] = count
124+
Language = row.LANGUAGE_NAME_EN
125+
Count = row.COUNT
126+
data[Language] = Count
104127

105-
data = pd.DataFrame(data.items(), columns=["language_name_en", "count"])
106-
average_count = data["count"].mean()
128+
data = pd.DataFrame(data.items(), columns=["Language", "Count"])
129+
average_count = data["Count"].mean()
107130

108-
data["category"] = data["count"].apply(
131+
data["Category"] = data["Count"].apply(
109132
lambda x: "Underrepresented" if x < average_count else "Represented"
110133
)
111-
language_counts = (
112-
data.groupby("category").size().reset_index(name="language_count")
113-
)
114-
language_counts.sort_values(
115-
"language_count", ascending=False, inplace=True
116-
)
134+
language_counts = data.groupby("Category").size().reset_index(name="Count")
135+
language_counts.sort_values("Count", ascending=False, inplace=True)
117136
file_path = shared.path_join(
118137
PATHS["data_phase"], "wikipedia_language_representation.csv"
119138
)
@@ -129,8 +148,9 @@ def main():
129148
PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
130149
)
131150
count_data = pd.read_csv(file_count, usecols=["LANGUAGE_NAME_EN", "COUNT"])
132-
process_highest_language_usage(args, count_data)
133151
process_language_representation(args, count_data)
152+
process_highest_language_usage(args, count_data)
153+
process_least_language_usage(args, count_data)
134154

135155
# Push changes
136156
args = shared.git_add_and_commit(

scripts/3-report/wikipedia_report.py

Lines changed: 83 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -82,15 +82,15 @@ def wikipedia_intro(args):
8282
"wikipedia_highest_language_usage.csv",
8383
)
8484
LOGGER.info(
85-
f"Data file (top 10 languages):"
85+
f"Data file (Most represented langauge):"
8686
f"{file_path_top10.replace(PATHS['repo'], '.')}"
8787
)
8888
name_label = "LANGUAGE_NAME_EN"
89-
name_label_top10 = "language_name_en"
89+
name_label_top10 = "Language"
9090
data = pd.read_csv(file_path, index_col=name_label)
9191
total_articles = data["COUNT"].sum()
9292
top10 = pd.read_csv(file_path_top10, index_col=name_label_top10)
93-
top10_articles = top10["count"].sum()
93+
top10_articles = top10["Count"].sum()
9494
top10_percentage = (top10_articles / total_articles) * 100
9595
average_articles = total_articles / len(data)
9696
language_count = len(data)
@@ -100,26 +100,71 @@ def wikipedia_intro(args):
100100
"Overview",
101101
None,
102102
None,
103-
"This report provides insights into the usage"
104-
" of the Creative Commons sharelike 4.0 across"
105-
" the different language edition of Wikipedia."
106-
" The wikipedia data, below, uses the `count`"
107-
" field from the Wikipedia API to quantify the number of articles"
108-
" in each language edition "
109-
" of Wikipedia."
110-
f"** The total number of Wikipedia articles across"
111-
f"** {language_count} languages is"
112-
f"** {total_articles:,}. The top 10 languages account for"
113-
f"** {top10_articles:,} articles, which is"
114-
f"** {top10_percentage:.2f}% of the total articles."
115-
f"** The average number of articles per language is"
116-
f"** {average_articles:,.2f}.**"
103+
"This report provides insights into the usage of the Creative Commons"
104+
" Attribution 4.0 International across the different language edition"
105+
" of Wikipedia. The wikipedia data, below, uses the `Count` field from"
106+
" the Wikipedia API to quantify the number of articles in"
107+
" each language edition of Wikipedia."
108+
"\n"
109+
f"**The total number of Wikipedia articles across"
110+
f" {language_count} languages is"
111+
f" {total_articles:,}. The top 10 languages account for"
112+
f" {top10_articles:,} articles, which is"
113+
f" {top10_percentage:,.2f}% of the total articles."
114+
f" The average number of articles per language is"
115+
f" {average_articles:,.2f}.**"
117116
"\n"
118117
"Thank you to Wikipedia and the Wikimedia Foundation for"
119118
" making this data publicly available!",
120119
)
121120

122121

122+
def plot_language_representation(args):
    """
    Create plots showing language representation
    """
    # The stripped docstring is what gets written to the log.
    LOGGER.info(plot_language_representation.__doc__.strip())

    category_column = "Category"
    count_column = "Count"
    title = "Language Representation"

    # Load the per-category counts written by the process phase.
    csv_path = shared.path_join(
        PATHS["data_2-process"],
        "wikipedia_language_representation.csv",
    )
    LOGGER.info(f"data file: {csv_path.replace(PATHS['repo'], '.')}")
    category_counts = pd.read_csv(
        csv_path, index_col=category_column
    ).sort_values(count_column, ascending=True)

    chart = plot.combined_plot(
        args=args,
        data=category_counts,
        title=title,
        name_label=category_column,
        data_label=count_column,
    )

    image_path = shared.path_join(
        PATHS["data_phase"], "wikipedia_language_representation.png"
    )
    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")

    if args.enable_save:
        # Create the directory if it does not exist
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        chart.savefig(image_path)

    description = (
        "Plots showing the language representation"
        " across different language editions of Wikipedia."
        " This shows how many languages are underrepresented"
        " (below average number of articles) versus"
        " represented (above average number of articles)."
    )
    shared.update_readme(args, SECTION, title, image_path, description)
166+
167+
123168
def plot_highest_language_usage(args):
124169
"""
125170
Create plots showing the most represented languages
@@ -130,11 +175,11 @@ def plot_highest_language_usage(args):
130175
"wikipedia_highest_language_usage.csv",
131176
)
132177
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
133-
name_label = "language_name_en"
134-
data_label = "count"
178+
name_label = "Language"
179+
data_label = "Count"
135180
data = pd.read_csv(file_path, index_col=name_label)
136181
data.sort_values(data_label, ascending=True, inplace=True)
137-
title = "Top 10 Highest Language Usage"
182+
title = "Most represented languages"
138183
plt = plot.combined_plot(
139184
args=args,
140185
data=data,
@@ -158,29 +203,29 @@ def plot_highest_language_usage(args):
158203
SECTION,
159204
title,
160205
image_path,
161-
"Plots showing the top 10 highest language usage"
206+
"Plots showing the most represented language usage"
162207
" across different language editions of Wikipedia."
163-
" This shows which languages have the most articles under CC BY-SA 4.0"
164-
" in Wikipedia, highlighting the distribution of content"
165-
" across languages.",
208+
" This shows which languages have the most articles"
209+
" under CC BY-SA 4.0 in Wikipedia, highlighting languages with higher"
210+
" langauge representation.",
166211
)
167212

168213

169-
def plot_language_representation(args):
214+
def plot_least_language_usage(args):
170215
"""
171-
Create plots showing language representation
216+
Create plots showing the least represented languages
172217
"""
173-
LOGGER.info(plot_language_representation.__doc__.strip())
218+
LOGGER.info(plot_least_language_usage.__doc__.strip())
174219
file_path = shared.path_join(
175220
PATHS["data_2-process"],
176-
"wikipedia_language_representation.csv",
221+
"wikipedia_least_language_usage.csv",
177222
)
178223
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
179-
name_label = "category"
180-
data_label = "language_count"
224+
name_label = "Language"
225+
data_label = "Count"
181226
data = pd.read_csv(file_path, index_col=name_label)
182227
data.sort_values(data_label, ascending=True, inplace=True)
183-
title = "Language Representation"
228+
title = "Least represented languages"
184229
plt = plot.combined_plot(
185230
args=args,
186231
data=data,
@@ -190,7 +235,7 @@ def plot_language_representation(args):
190235
)
191236

192237
image_path = shared.path_join(
193-
PATHS["data_phase"], "wikipedia_language_representation.png"
238+
PATHS["data_phase"], "wikipedia_least_language_usage.png"
194239
)
195240
LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
196241

@@ -204,11 +249,11 @@ def plot_language_representation(args):
204249
SECTION,
205250
title,
206251
image_path,
207-
"Plots showing the language representation"
252+
"Plots showing the least represented language usage"
208253
" across different language editions of Wikipedia."
209-
" This shows how many languages are underrepresented"
210-
" (below average number of articles) versus"
211-
" represented (above average number of articles).",
254+
" This shows which languages have the least articles"
255+
" under CC BY-SA 4.0 in Wikipedia, highlighting"
256+
" languages with lower content representation.",
212257
)
213258

214259

@@ -217,8 +262,9 @@ def main():
217262
shared.paths_log(LOGGER, PATHS)
218263
shared.git_fetch_and_merge(args, PATHS["repo"])
219264
wikipedia_intro(args)
220-
plot_highest_language_usage(args)
221265
plot_language_representation(args)
266+
plot_highest_language_usage(args)
267+
plot_least_language_usage(args)
222268

223269
# Add and commit changes
224270
args = shared.git_add_and_commit(

0 commit comments

Comments
 (0)