Merge branch 'main' into europeana-feature

Joyakis · Joyakis · commit 47b5f20b27f0 · 2025-10-30T13:27:21.000+03:00
diff --git a/dev/create_gcs_query_plan.py b/dev/create_gcs_query_plan.py
@@ -117,7 +117,7 @@ def get_tool_urls():
     file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
     prefix = "//creativecommons.org/"
     tool_urls = []
-    with open(file_path, "r") as file_obj:
+    with open(file_path, "r", encoding="utf-8") as file_obj:
         for line in file_obj:
             tool_urls.append(f"{prefix}{line.strip()}")
     LOGGER.info("Prioritizing CC Legal Tool URLs")
@@ -127,14 +127,14 @@ def get_tool_urls():
 
 def load_countries():
     file_path = shared.path_join(PATHS["data"], "gcs_country_collection.yaml")
-    with open(file_path, "r") as file_obj:
+    with open(file_path, "r", encoding="utf-8") as file_obj:
         countries = yaml.safe_load(file_obj)
     return countries
 
 
 def load_languages():
     file_path = shared.path_join(PATHS["data"], "gcs_language_collection.yaml")
-    with open(file_path, "r") as file_obj:
+    with open(file_path, "r", encoding="utf-8") as file_obj:
         languages = yaml.safe_load(file_obj)
     return languages
 
@@ -209,7 +209,7 @@ def save_plan(plan):
         "LANGUAGE",
         "LR",
     ]
-    with open(file_path, "w") as file_obj:
+    with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
         writer = csv.DictWriter(
             file_obj, fieldnames=fieldnames, dialect="unix"
         )
diff --git a/dev/prioritize_tools.py b/dev/prioritize_tools.py
@@ -42,7 +42,7 @@ def get_tool_urls():
     file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
     prefix = "//creativecommons.org/"
     tool_urls = []
-    with open(file_path, "r") as file_obj:
+    with open(file_path, "r", encoding="utf-8") as file_obj:
         for line in file_obj:
             tool_urls.append(f"{prefix}{line.strip()}")
     return tool_urls
@@ -112,7 +112,7 @@ def save_tools_list(tool_urls):
     LOGGER.info("Saving prioritized CC Legal Tool URLs")
     file_path = shared.path_join(PATHS["data"], "prioritized-tool-urls.txt")
     tool_urls.append("")  # ensure file has end of file newline
-    with open(file_path, "w") as file_obj:
+    with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
         file_obj.writelines("\n".join(tool_urls))
 
 
diff --git a/scripts/1-fetch/gcs_fetch.py b/scripts/1-fetch/gcs_fetch.py
@@ -104,7 +104,7 @@ def get_search_service():
 
 def initialize_data_file(file_path, header):
     if not os.path.isfile(file_path):
-        with open(file_path, "w", newline="") as file_obj:
+        with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
             writer = csv.DictWriter(
                 file_obj, fieldnames=header, dialect="unix"
             )
@@ -127,7 +127,7 @@ def get_last_completed_plan_index():
     last_completed_plan_index = 0
     for file_path in [FILE1_COUNT, FILE2_LANGUAGE, FILE3_COUNTRY]:
         try:
-            with open(file_path, "r", newline="") as file_obj:
+            with open(file_path, "r", encoding="utf-8") as file_obj:
                 reader = csv.DictReader(file_obj, dialect="unix")
                 for row in reader:
                     pass  # skip through to last row
@@ -147,7 +147,7 @@ def get_last_completed_plan_index():
 def load_plan():
     plan = []
     file_path = shared.path_join(PATHS["data"], "gcs_query_plan.csv")
-    with open(file_path, "r", newline="") as file_obj:
+    with open(file_path, "r", encoding="utf-8") as file_obj:
         plan = list(csv.DictReader(file_obj, dialect="unix"))
     return plan
 
@@ -181,7 +181,7 @@ def append_data(args, plan_row, index, count):
             "TOOL_IDENTIFIER": plan_row["TOOL_IDENTIFIER"],
             "COUNT": count,
         }
-    with open(file_path, "a", newline="") as file_obj:
+    with open(file_path, "a", encoding="utf-8", newline="\n") as file_obj:
         writer = csv.DictWriter(
             file_obj, fieldnames=fieldnames, dialect="unix"
         )
diff --git a/scripts/1-fetch/github_fetch.py b/scripts/1-fetch/github_fetch.py
@@ -110,7 +110,7 @@ def write_data(args, tool_data):
         LOGGER.error("Unable to fetch all records. Aborting.")
         return args
 
-    with open(FILE1_COUNT, "w", newline="") as file_obj:
+    with open(FILE1_COUNT, "w", encoding="utf-8", newline="\n") as file_obj:
         writer = csv.DictWriter(
             file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
         )
diff --git a/scripts/1-fetch/wikipedia_fetch.py b/scripts/1-fetch/wikipedia_fetch.py
@@ -83,7 +83,7 @@ def write_data(args, tool_data):
     LOGGER.info("Saving fetched data")
     os.makedirs(PATHS["data_phase"], exist_ok=True)
 
-    with open(FILE_LANGUAGES, "w", newline="", encoding="utf-8") as file_obj:
+    with open(FILE_LANGUAGES, "w", encoding="utf-8", newline="\n") as file_obj:
         writer = csv.DictWriter(
             file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
         )
diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py
@@ -1,14 +1,18 @@
 #!/usr/bin/env python
 """
-This file is dedicated to processing Github data
+This file is dedicated to processing GitHub data
 for analysis and comparison between quarters.
 """
 # Standard library
+import argparse
+import csv
 import os
 import sys
 import traceback
 
+# Third-party
 # import pandas as pd
+import pandas as pd
 
 # Add parent directory so shared can be imported
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -19,6 +23,112 @@
 # Setup
 LOGGER, PATHS = shared.setup(__file__)
 
+# Constants
+QUARTER = os.path.basename(PATHS["data_quarter"])
+
+
+def parse_arguments():
+    """
+    Parse command-line options, returns parsed argument namespace.
+    """
+    LOGGER.info("Parsing command-line options")
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--quarter",
+        default=QUARTER,
+        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
+    )
+    parser.add_argument(
+        "--enable-save",
+        action="store_true",
+        help="Enable saving results (default: False)",
+    )
+    parser.add_argument(
+        "--enable-git",
+        action="store_true",
+        help="Enable git actions such as fetch, merge, add, commit, and push"
+        " (default: False)",
+    )
+    args = parser.parse_args()
+    if not args.enable_save and args.enable_git:
+        parser.error("--enable-git requires --enable-save")
+    if args.quarter != QUARTER:
+        global PATHS
+        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
+    args.logger = LOGGER
+    args.paths = PATHS
+    return args
+
+
+def data_to_csv(args, data, file_path):
+    if not args.enable_save:
+        return
+    os.makedirs(PATHS["data_phase"], exist_ok=True)
+    # emulate csv.unix_dialect
+    data.to_csv(
+        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
+    )
+
+
+def process_totals_by_license(args, count_data):
+    """
+    Processing count data: totals by License
+    """
+    LOGGER.info(process_totals_by_license.__doc__.strip())
+    data = {}
+
+    for row in count_data.itertuples(index=False):
+        tool = str(row.TOOL_IDENTIFIER)
+        count = int(row.COUNT)
+
+        if tool == "Total public repositories":
+            continue
+
+        data[tool] = count
+
+    data = pd.DataFrame(data.items(), columns=["License", "Count"])
+    data.sort_values("License", ascending=True, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(
+        PATHS["data_phase"], "github_totals_by_license.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_restriction(args, count_data):
+    """
+    Processing count data: totals by restriction
+    """
+    # https://creativecommons.org/public-domain/freeworks/
+    LOGGER.info(process_totals_by_restriction.__doc__.strip())
+    data = {"Copyleft": 0, "Permissive": 0, "Public domain": 0}
+
+    for row in count_data.itertuples(index=False):
+        tool = str(row.TOOL_IDENTIFIER)
+        count = int(row.COUNT)
+
+        if tool == "Total public repositories":
+            continue
+
+        if tool in ["BSD Zero Clause License", "CC0 1.0", "Unlicense"]:
+            key = "Public domain"
+        elif tool in ["MIT No Attribution", "CC BY 4.0"]:
+            key = "Permissive"
+        elif tool in ["CC BY-SA 4.0"]:
+            key = "Copyleft"
+        else:
+            continue
+
+        data[key] += count
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    data.sort_values("Category", ascending=True, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(
+        PATHS["data_phase"], "github_totals_by_restriction.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
 # def load_quarter_data(quarter):
 #     """
 #     Load data for a specific quarter.
@@ -63,18 +173,23 @@
 
 
 def main():
-    raise shared.QuantifyingException("No current code for Phase 2", 0)
-
-    # # Fetch and merge changes
-    # shared.fetch_and_merge(PATHS["repo"])
-
-    # # Add and commit changes
-    # shared.add_and_commit(
-    #     PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data"
-    # )
-
-    # # Push changes
-    # shared.push_changes(PATHS["repo"])
+    args = parse_arguments()
+    shared.paths_log(LOGGER, PATHS)
+    shared.git_fetch_and_merge(args, PATHS["repo"])
+
+    file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
+    count_data = pd.read_csv(file_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    process_totals_by_license(args, count_data)
+    process_totals_by_restriction(args, count_data)
+
+    # Push changes
+    args = shared.git_add_and_commit(
+        args,
+        PATHS["repo"],
+        PATHS["data_quarter"],
+        f"Add and commit new GitHub data for {QUARTER}",
+    )
+    shared.git_push_changes(args, PATHS["repo"])
 
 
 if __name__ == "__main__":
diff --git a/scripts/3-report/github_report.py b/scripts/3-report/github_report.py
diff --git a/scripts/shared.py b/scripts/shared.py

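File: scripts/1-fetch/github_fetch.py (table rendering of the `write_data` hunk; not part of scripts/shared.py)
Original file line number	Diff line number	Diff line change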
`@@ -110,7 +110,7 @@ def write_data(args, tool_data):`
`110`	`110`	`LOGGER.error("Unable to fetch all records. Aborting.")`
`111`	`111`	`return args`
`112`	`112`
`113`		`- with open(FILE1_COUNT, "w", newline="") as file_obj:`
	`113`	`+ with open(FILE1_COUNT, "w", encoding="utf-8", newline="\n") as file_obj:`
`114`	`114`	`writer = csv.DictWriter(`
`115`	`115`	`file_obj, fieldnames=HEADER1_COUNT, dialect="unix"`
`116`	`116`	`)`
File: scripts/1-fetch/wikipedia_fetch.py (table rendering of the `write_data` hunk)
Original file line number	Diff line number	Diff line change
`@@ -83,7 +83,7 @@ def write_data(args, tool_data):`
`83`	`83`	`LOGGER.info("Saving fetched data")`
`84`	`84`	`os.makedirs(PATHS["data_phase"], exist_ok=True)`
`85`	`85`
`86`		`- with open(FILE_LANGUAGES, "w", newline="", encoding="utf-8") as file_obj:`
	`86`	`+ with open(FILE_LANGUAGES, "w", encoding="utf-8", newline="\n") as file_obj:`
`87`	`87`	`writer = csv.DictWriter(`
`88`	`88`	`file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"`
`89`	`89`	`)`