Skip to content

Commit 3a69e03

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 2097690 commit 3a69e03

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

62 files changed

+123
-154
lines changed

ac_dc/anonymization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def apply_regex_anonymization(
3030
tag_type=tag_type,
3131
)
3232
if anonymize_condition:
33-
for (ent, start, end, tag) in ner:
33+
for ent, start, end, tag in ner:
3434
# we need to actually walk through and replace by start, end span.
3535
sentence = sentence.replace(ent, f" <{tag}> ")
3636
return sentence, ner

ac_dc/deduplicate/self_deduplicate.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# -*- coding: utf-8 -*-
32
# @Date : 2022-01-08 22:39:29
43
# @Author : Chenghao Mou (mouchenghao@gmail.com)
54
# @Description: Self-deduplication with `datasets`
@@ -28,7 +27,7 @@
2827

2928
def main(conf: str) -> None:
3029

31-
with open(conf, "r") as f:
30+
with open(conf) as f:
3231
conf = yaml.safe_load(f.read())
3332

3433
if conf["load_from_disk"]["path"]:

ac_dc/languages_id.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import pandas as pd
22

3-
43
langs_id = [
54
{
65
"lang": "Arabic",

ac_dc/normalization.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import re
22
from typing import Dict
33

4-
54
non_printing_characters_re = re.compile(
65
f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
76
)

ac_dc/parameters_filtering.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import string
22
import emoji
33

4-
54
main_special_characters = string.punctuation + string.digits + string.whitespace
65
other_special_characters = (
76
"’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"

ac_dc/visualization/get_data_for_visualization.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,9 @@ def compute_stats(self):
9090
)
9191
for n in range(2, 16)
9292
}
93-
stats_document[
94-
"character_repetition_ratio"
95-
] = character_repetition_ratios
93+
stats_document["character_repetition_ratio"] = (
94+
character_repetition_ratios
95+
)
9696

9797
word_repetition_ratios = {
9898
n: round(

ac_dc/visualization/visualization.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -290,16 +290,16 @@ def get_cond(key, cutoff, max_cutoff):
290290
"stopwords_ratio"
291291
]
292292
for i in range(len(self.docs["stopwords_ratio"])):
293-
self.docs["stopwords_ratio"].iloc[
294-
i
295-
] = Filtering.compute_stopwords_ratio(
296-
self.docs["text"].iloc[i],
297-
self.sentencepiece_model_tok,
298-
self.param["strip_characters"],
299-
self.param["cond_words_augmentation"],
300-
self.param["words_augmentation_group_sizes"],
301-
self.param["words_augmentation_join_char"],
302-
new_stopwords,
293+
self.docs["stopwords_ratio"].iloc[i] = (
294+
Filtering.compute_stopwords_ratio(
295+
self.docs["text"].iloc[i],
296+
self.sentencepiece_model_tok,
297+
self.param["strip_characters"],
298+
self.param["cond_words_augmentation"],
299+
self.param["words_augmentation_group_sizes"],
300+
self.param["words_augmentation_join_char"],
301+
new_stopwords,
302+
)
303303
)
304304
cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
305305
cutoff_stopwords_ratio = st.slider(
@@ -326,16 +326,16 @@ def get_cond(key, cutoff, max_cutoff):
326326
"flagged_words_ratio"
327327
]
328328
for i in range(len(self.docs["flagged_words_ratio"])):
329-
self.docs["flagged_words_ratio"].iloc[
330-
i
331-
] = Filtering.compute_flagged_words_ratio(
332-
self.docs["text"].iloc[i],
333-
self.sentencepiece_model_tok,
334-
self.param["strip_characters"],
335-
self.param["cond_words_augmentation"],
336-
self.param["words_augmentation_group_sizes"],
337-
self.param["words_augmentation_join_char"],
338-
new_flagged_words,
329+
self.docs["flagged_words_ratio"].iloc[i] = (
330+
Filtering.compute_flagged_words_ratio(
331+
self.docs["text"].iloc[i],
332+
self.sentencepiece_model_tok,
333+
self.param["strip_characters"],
334+
self.param["cond_words_augmentation"],
335+
self.param["words_augmentation_group_sizes"],
336+
self.param["words_augmentation_join_char"],
337+
new_flagged_words,
338+
)
339339
)
340340
cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
341341
max_fwr = np.max(self.docs["flagged_words_ratio"])

bertin/evaluation/run_glue.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# coding=utf-8
32
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
43
#
54
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,7 +12,8 @@
1312
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1413
# See the License for the specific language governing permissions and
1514
# limitations under the License.
16-
""" Finetuning the library models for sequence classification on GLUE."""
15+
"""Finetuning the library models for sequence classification on GLUE."""
16+
1717
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.
1818

1919
import logging
@@ -384,19 +384,23 @@ def main():
384384
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
385385
# download model & vocab.
386386
config = AutoConfig.from_pretrained(
387-
model_args.config_name
388-
if model_args.config_name
389-
else model_args.model_name_or_path,
387+
(
388+
model_args.config_name
389+
if model_args.config_name
390+
else model_args.model_name_or_path
391+
),
390392
num_labels=num_labels,
391393
finetuning_task=data_args.task_name,
392394
cache_dir=model_args.cache_dir,
393395
revision=model_args.model_revision,
394396
use_auth_token=True if model_args.use_auth_token else None,
395397
)
396398
tokenizer = AutoTokenizer.from_pretrained(
397-
model_args.tokenizer_name
398-
if model_args.tokenizer_name
399-
else model_args.model_name_or_path,
399+
(
400+
model_args.tokenizer_name
401+
if model_args.tokenizer_name
402+
else model_args.model_name_or_path
403+
),
400404
cache_dir=model_args.cache_dir,
401405
use_fast=model_args.use_fast_tokenizer,
402406
revision=model_args.model_revision,

bertin/evaluation/run_ner.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# coding=utf-8
32
# Copyright 2020 The HuggingFace Team All rights reserved.
43
#
54
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,6 +15,7 @@
1615
"""
1716
Fine-tuning the library models for token classification.
1817
"""
18+
1919
# You can also adapt this script on your own token classification task and datasets. Pointers for this are left as
2020
# comments.
2121

@@ -364,9 +364,11 @@ def get_label_list(labels):
364364
# The .from_pretrained methods guarantee that only one local process can concurrently
365365
# download model & vocab.
366366
config = AutoConfig.from_pretrained(
367-
model_args.config_name
368-
if model_args.config_name
369-
else model_args.model_name_or_path,
367+
(
368+
model_args.config_name
369+
if model_args.config_name
370+
else model_args.model_name_or_path
371+
),
370372
num_labels=num_labels,
371373
label2id=label_to_id,
372374
id2label={i: l for l, i in label_to_id.items()},
@@ -636,9 +638,9 @@ def compute_metrics(p):
636638
kwargs["dataset_tags"] = data_args.dataset_name
637639
if data_args.dataset_config_name is not None:
638640
kwargs["dataset_args"] = data_args.dataset_config_name
639-
kwargs[
640-
"dataset"
641-
] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
641+
kwargs["dataset"] = (
642+
f"{data_args.dataset_name} {data_args.dataset_config_name}"
643+
)
642644
else:
643645
kwargs["dataset"] = data_args.dataset_name
644646

bertin/mc4/mc4.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Perplexity Sampled mC4 dataset based on Common Crawl."""
22

3-
43
import gzip
54
import json
65

@@ -404,7 +403,7 @@ def _generate_examples(self, filepaths):
404403
for filepath in filepaths:
405404
logger.info("generating examples from = %s", filepath)
406405
if filepath.endswith("jsonl"):
407-
with open(filepath, "r", encoding="utf-8") as f:
406+
with open(filepath, encoding="utf-8") as f:
408407
for line in f:
409408
if line:
410409
example = json.loads(line)

0 commit comments

Comments (0)