Skip to content

Commit 3a69e03

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 2097690 commit 3a69e03

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

62 files changed

+123
-154
lines changed

ac_dc/anonymization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def apply_regex_anonymization(
3030
tag_type=tag_type,
3131
)
3232
if anonymize_condition:
33-
for (ent, start, end, tag) in ner:
33+
for ent, start, end, tag in ner:
3434
# we need to actually walk through and replace by start, end span.
3535
sentence = sentence.replace(ent, f" <{tag}> ")
3636
return sentence, ner

ac_dc/deduplicate/self_deduplicate.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# -*- coding: utf-8 -*-
32
# @Date : 2022-01-08 22:39:29
43
# @Author : Chenghao Mou (mouchenghao@gmail.com)
54
# @Description: Self-deduplication with `datasets`
@@ -28,7 +27,7 @@
2827

2928
def main(conf: str) -> None:
3029

31-
with open(conf, "r") as f:
30+
with open(conf) as f:
3231
conf = yaml.safe_load(f.read())
3332

3433
if conf["load_from_disk"]["path"]:

ac_dc/languages_id.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import pandas as pd
22

3-
43
langs_id = [
54
{
65
"lang": "Arabic",

ac_dc/normalization.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import re
22
from typing import Dict
33

4-
54
non_printing_characters_re = re.compile(
65
f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
76
)

ac_dc/parameters_filtering.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import string
22
import emoji
33

4-
54
main_special_characters = string.punctuation + string.digits + string.whitespace
65
other_special_characters = (
76
"’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"

ac_dc/visualization/get_data_for_visualization.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,9 @@ def compute_stats(self):
9090
)
9191
for n in range(2, 16)
9292
}
93-
stats_document[
94-
"character_repetition_ratio"
95-
] = character_repetition_ratios
93+
stats_document["character_repetition_ratio"] = (
94+
character_repetition_ratios
95+
)
9696

9797
word_repetition_ratios = {
9898
n: round(

ac_dc/visualization/visualization.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -290,16 +290,16 @@ def get_cond(key, cutoff, max_cutoff):
290290
"stopwords_ratio"
291291
]
292292
for i in range(len(self.docs["stopwords_ratio"])):
293-
self.docs["stopwords_ratio"].iloc[
294-
i
295-
] = Filtering.compute_stopwords_ratio(
296-
self.docs["text"].iloc[i],
297-
self.sentencepiece_model_tok,
298-
self.param["strip_characters"],
299-
self.param["cond_words_augmentation"],
300-
self.param["words_augmentation_group_sizes"],
301-
self.param["words_augmentation_join_char"],
302-
new_stopwords,
293+
self.docs["stopwords_ratio"].iloc[i] = (
294+
Filtering.compute_stopwords_ratio(
295+
self.docs["text"].iloc[i],
296+
self.sentencepiece_model_tok,
297+
self.param["strip_characters"],
298+
self.param["cond_words_augmentation"],
299+
self.param["words_augmentation_group_sizes"],
300+
self.param["words_augmentation_join_char"],
301+
new_stopwords,
302+
)
303303
)
304304
cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
305305
cutoff_stopwords_ratio = st.slider(
@@ -326,16 +326,16 @@ def get_cond(key, cutoff, max_cutoff):
326326
"flagged_words_ratio"
327327
]
328328
for i in range(len(self.docs["flagged_words_ratio"])):
329-
self.docs["flagged_words_ratio"].iloc[
330-
i
331-
] = Filtering.compute_flagged_words_ratio(
332-
self.docs["text"].iloc[i],
333-
self.sentencepiece_model_tok,
334-
self.param["strip_characters"],
335-
self.param["cond_words_augmentation"],
336-
self.param["words_augmentation_group_sizes"],
337-
self.param["words_augmentation_join_char"],
338-
new_flagged_words,
329+
self.docs["flagged_words_ratio"].iloc[i] = (
330+
Filtering.compute_flagged_words_ratio(
331+
self.docs["text"].iloc[i],
332+
self.sentencepiece_model_tok,
333+
self.param["strip_characters"],
334+
self.param["cond_words_augmentation"],
335+
self.param["words_augmentation_group_sizes"],
336+
self.param["words_augmentation_join_char"],
337+
new_flagged_words,
338+
)
339339
)
340340
cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
341341
max_fwr = np.max(self.docs["flagged_words_ratio"])

bertin/evaluation/run_glue.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# coding=utf-8
32
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
43
#
54
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,7 +12,8 @@
1312
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1413
# See the License for the specific language governing permissions and
1514
# limitations under the License.
16-
""" Finetuning the library models for sequence classification on GLUE."""
15+
"""Finetuning the library models for sequence classification on GLUE."""
16+
1717
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.
1818

1919
import logging
@@ -384,19 +384,23 @@ def main():
384384
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
385385
# download model & vocab.
386386
config = AutoConfig.from_pretrained(
387-
model_args.config_name
388-
if model_args.config_name
389-
else model_args.model_name_or_path,
387+
(
388+
model_args.config_name
389+
if model_args.config_name
390+
else model_args.model_name_or_path
391+
),
390392
num_labels=num_labels,
391393
finetuning_task=data_args.task_name,
392394
cache_dir=model_args.cache_dir,
393395
revision=model_args.model_revision,
394396
use_auth_token=True if model_args.use_auth_token else None,
395397
)
396398
tokenizer = AutoTokenizer.from_pretrained(
397-
model_args.tokenizer_name
398-
if model_args.tokenizer_name
399-
else model_args.model_name_or_path,
399+
(
400+
model_args.tokenizer_name
401+
if model_args.tokenizer_name
402+
else model_args.model_name_or_path
403+
),
400404
cache_dir=model_args.cache_dir,
401405
use_fast=model_args.use_fast_tokenizer,
402406
revision=model_args.model_revision,

bertin/evaluation/run_ner.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python
2-
# coding=utf-8
32
# Copyright 2020 The HuggingFace Team All rights reserved.
43
#
54
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,6 +15,7 @@
1615
"""
1716
Fine-tuning the library models for token classification.
1817
"""
18+
1919
# You can also adapt this script on your own token classification task and datasets. Pointers for this are left as
2020
# comments.
2121

@@ -364,9 +364,11 @@ def get_label_list(labels):
364364
# The .from_pretrained methods guarantee that only one local process can concurrently
365365
# download model & vocab.
366366
config = AutoConfig.from_pretrained(
367-
model_args.config_name
368-
if model_args.config_name
369-
else model_args.model_name_or_path,
367+
(
368+
model_args.config_name
369+
if model_args.config_name
370+
else model_args.model_name_or_path
371+
),
370372
num_labels=num_labels,
371373
label2id=label_to_id,
372374
id2label={i: l for l, i in label_to_id.items()},
@@ -636,9 +638,9 @@ def compute_metrics(p):
636638
kwargs["dataset_tags"] = data_args.dataset_name
637639
if data_args.dataset_config_name is not None:
638640
kwargs["dataset_args"] = data_args.dataset_config_name
639-
kwargs[
640-
"dataset"
641-
] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
641+
kwargs["dataset"] = (
642+
f"{data_args.dataset_name} {data_args.dataset_config_name}"
643+
)
642644
else:
643645
kwargs["dataset"] = data_args.dataset_name
644646

bertin/mc4/mc4.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Perplexity Sampled mC4 dataset based on Common Crawl."""
22

3-
43
import gzip
54
import json
65

@@ -404,7 +403,7 @@ def _generate_examples(self, filepaths):
404403
for filepath in filepaths:
405404
logger.info("generating examples from = %s", filepath)
406405
if filepath.endswith("jsonl"):
407-
with open(filepath, "r", encoding="utf-8") as f:
406+
with open(filepath, encoding="utf-8") as f:
408407
for line in f:
409408
if line:
410409
example = json.loads(line)

0 commit comments

Comments (0)