Add least-represented-languages processing and plot; rename processed CSV columns to Language/Count/Category

oree-xx · oree-xx · commit 7cdb20045e73 · 2025-11-15T07:00:07.000Z
diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py
@@ -71,25 +71,48 @@ def data_to_csv(args, data, file_path):
 
 def process_highest_language_usage(args, count_data):
     """
-    Processing count data: top 10 highest language usage
+    Processing count data: Most represented languages
     """
     LOGGER.info(process_highest_language_usage.__doc__.strip())
     data = {}
 
     for row in count_data.itertuples(index=False):
-        language_name_en = row.LANGUAGE_NAME_EN
-        count = row.COUNT
-        data[language_name_en] = count
+        Language = row.LANGUAGE_NAME_EN
+        Count = row.COUNT
+        data[Language] = Count
 
-    data = pd.DataFrame(data.items(), columns=["language_name_en", "count"])
-    data.sort_values("count", ascending=False, inplace=True)
+    data = pd.DataFrame(data.items(), columns=["Language", "Count"])
+    data.sort_values("Count", ascending=False, inplace=True)
     top_10 = data.head(10)
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
     )
     data_to_csv(args, top_10, file_path)
 
 
+def process_least_language_usage(args, count_data):
+    """
+    Processing count data: Least represented languages
+    """
+    LOGGER.info(process_least_language_usage.__doc__.strip())
+    data = {}
+
+    for row in count_data.itertuples(index=False):
+        Language = row.LANGUAGE_NAME_EN
+        Count = row.COUNT
+
+        if Count >= 1:
+            data[Language] = Count
+
+    data = pd.DataFrame(data.items(), columns=["Language", "Count"])
+    data.sort_values("Count", ascending=True, inplace=True)
+    bottom_10 = data.head(10)
+    file_path = shared.path_join(
+        PATHS["data_phase"], "wikipedia_least_language_usage.csv"
+    )
+    data_to_csv(args, bottom_10, file_path)
+
+
 def process_language_representation(args, count_data):
     """
     Processing count data: language representation
@@ -98,22 +121,18 @@ def process_language_representation(args, count_data):
     data = {}
 
     for row in count_data.itertuples(index=False):
-        language_name_en = row.LANGUAGE_NAME_EN
-        count = row.COUNT
-        data[language_name_en] = count
+        Language = row.LANGUAGE_NAME_EN
+        Count = row.COUNT
+        data[Language] = Count
 
-    data = pd.DataFrame(data.items(), columns=["language_name_en", "count"])
-    average_count = data["count"].mean()
+    data = pd.DataFrame(data.items(), columns=["Language", "Count"])
+    average_count = data["Count"].mean()
 
-    data["category"] = data["count"].apply(
+    data["Category"] = data["Count"].apply(
         lambda x: "Underrepresented" if x < average_count else "Represented"
     )
-    language_counts = (
-        data.groupby("category").size().reset_index(name="language_count")
-    )
-    language_counts.sort_values(
-        "language_count", ascending=False, inplace=True
-    )
+    language_counts = data.groupby("Category").size().reset_index(name="Count")
+    language_counts.sort_values("Count", ascending=False, inplace=True)
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_language_representation.csv"
     )
@@ -129,8 +148,9 @@ def main():
         PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
     )
     count_data = pd.read_csv(file_count, usecols=["LANGUAGE_NAME_EN", "COUNT"])
-    process_highest_language_usage(args, count_data)
     process_language_representation(args, count_data)
+    process_highest_language_usage(args, count_data)
+    process_least_language_usage(args, count_data)
 
     # Push changes
     args = shared.git_add_and_commit(
diff --git a/scripts/3-report/wikipedia_report.py b/scripts/3-report/wikipedia_report.py
@@ -82,15 +82,15 @@ def wikipedia_intro(args):
         "wikipedia_highest_language_usage.csv",
     )
     LOGGER.info(
-        f"Data file (top 10 languages):"
+        f"Data file (Most represented languages):"
         f"{file_path_top10.replace(PATHS['repo'], '.')}"
     )
     name_label = "LANGUAGE_NAME_EN"
-    name_label_top10 = "language_name_en"
+    name_label_top10 = "Language"
     data = pd.read_csv(file_path, index_col=name_label)
     total_articles = data["COUNT"].sum()
     top10 = pd.read_csv(file_path_top10, index_col=name_label_top10)
-    top10_articles = top10["count"].sum()
+    top10_articles = top10["Count"].sum()
     top10_percentage = (top10_articles / total_articles) * 100
     average_articles = total_articles / len(data)
     language_count = len(data)
@@ -100,26 +100,71 @@ def wikipedia_intro(args):
         "Overview",
         None,
         None,
-        "This report provides insights into the usage"
-        " of the Creative Commons sharelike 4.0 across"
-        " the different language edition of Wikipedia."
-        " The wikipedia data, below, uses the `count`"
-        " field from the Wikipedia API to quantify the number of articles"
-        " in each language edition "
-        " of Wikipedia."
-        f"** The total number of Wikipedia articles across"
-        f"** {language_count} languages is"
-        f"** {total_articles:,}. The top 10 languages account for"
-        f"** {top10_articles:,} articles, which is"
-        f"** {top10_percentage:.2f}% of the total articles."
-        f"** The average number of articles per language is"
-        f"** {average_articles:,.2f}.**"
+        "This report provides insights into the usage of the Creative Commons"
+        " Attribution-ShareAlike 4.0 International license across the"
+        " different language editions of Wikipedia. The Wikipedia data, below,"
+        " uses the `Count` field from the Wikipedia API to quantify the number"
+        " of articles in each language edition of Wikipedia."
+        "\n"
+        f"**The total number of Wikipedia articles across"
+        f" {language_count} languages is"
+        f" {total_articles:,}. The top 10 languages account for"
+        f" {top10_articles:,} articles, which is"
+        f" {top10_percentage:,.2f}% of the total articles."
+        f" The average number of articles per language is"
+        f" {average_articles:,.2f}.**"
         "\n"
         "Thank you to Wikipedia and the Wikimedia Foundation for"
         " making this data publicly available!",
     )
 
 
+def plot_language_representation(args):
+    """
+    Create plots showing language representation
+    """
+    LOGGER.info(plot_language_representation.__doc__.strip())
+    file_path = shared.path_join(
+        PATHS["data_2-process"],
+        "wikipedia_language_representation.csv",
+    )
+    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
+    name_label = "Category"
+    data_label = "Count"
+    data = pd.read_csv(file_path, index_col=name_label)
+    data.sort_values(data_label, ascending=True, inplace=True)
+    title = "Language Representation"
+    plt = plot.combined_plot(
+        args=args,
+        data=data,
+        title=title,
+        name_label=name_label,
+        data_label=data_label,
+    )
+
+    image_path = shared.path_join(
+        PATHS["data_phase"], "wikipedia_language_representation.png"
+    )
+    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
+
+    if args.enable_save:
+        # Create the directory if it does not exist
+        os.makedirs(PATHS["data_phase"], exist_ok=True)
+        plt.savefig(image_path)
+
+    shared.update_readme(
+        args,
+        SECTION,
+        title,
+        image_path,
+        "Plots showing the language representation"
+        " across different language editions of Wikipedia."
+        " This shows how many languages are underrepresented"
+        " (below average number of articles) versus"
+        " represented (above average number of articles).",
+    )
+
+
 def plot_highest_language_usage(args):
     """
     Create plots showing totals by license type
@@ -130,11 +175,11 @@ def plot_highest_language_usage(args):
         "wikipedia_highest_language_usage.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
-    name_label = "language_name_en"
-    data_label = "count"
+    name_label = "Language"
+    data_label = "Count"
     data = pd.read_csv(file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
-    title = "Top 10 Highest Language Usage"
+    title = "Most represented languages"
     plt = plot.combined_plot(
         args=args,
         data=data,
@@ -158,29 +203,29 @@ def plot_highest_language_usage(args):
         SECTION,
         title,
         image_path,
-        "Plots showing the top 10 highest language usage"
+        "Plots showing the most represented languages"
         " across different language editions of Wikipedia."
-        " This shows which languages have the most articles under CC BY-SA 4.0"
-        " in Wikipedia, highlighting the distribution of content"
-        " across languages.",
+        " This shows which languages have the most articles"
+        " under CC BY-SA 4.0 in Wikipedia, highlighting languages with higher"
+        " content representation.",
     )
 
 
-def plot_language_representation(args):
+def plot_least_language_usage(args):
     """
-    Create plots showing language representation
+    Create plots showing the least represented languages
     """
-    LOGGER.info(plot_language_representation.__doc__.strip())
+    LOGGER.info(plot_least_language_usage.__doc__.strip())
     file_path = shared.path_join(
         PATHS["data_2-process"],
-        "wikipedia_language_representation.csv",
+        "wikipedia_least_language_usage.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
-    name_label = "category"
-    data_label = "language_count"
+    name_label = "Language"
+    data_label = "Count"
     data = pd.read_csv(file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
-    title = "Language Representation"
+    title = "Least represented languages"
     plt = plot.combined_plot(
         args=args,
         data=data,
@@ -190,7 +235,7 @@ def plot_language_representation(args):
     )
 
     image_path = shared.path_join(
-        PATHS["data_phase"], "wikipedia_language_representation.png"
+        PATHS["data_phase"], "wikipedia_least_language_usage.png"
     )
     LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
 
@@ -204,11 +249,11 @@ def plot_language_representation(args):
         SECTION,
         title,
         image_path,
-        "Plots showing the language representation"
+        "Plots showing the least represented languages"
         " across different language editions of Wikipedia."
-        " This shows how many languages are underrepresented"
-        " (below average number of articles) versus"
-        " represented (above average number of articles).",
+        " This shows which languages have the fewest articles"
+        " under CC BY-SA 4.0 in Wikipedia, highlighting"
+        " languages with lower content representation.",
     )
 
 
@@ -217,8 +262,9 @@ def main():
     shared.paths_log(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
     wikipedia_intro(args)
-    plot_highest_language_usage(args)
     plot_language_representation(args)
+    plot_highest_language_usage(args)
+    plot_least_language_usage(args)
 
     # Add and commit changes
     args = shared.git_add_and_commit(