Skip to content

Commit 87e350a

Browse files
committed
fix issue with filtering
1 parent 2a353e6 commit 87e350a

3 files changed

Lines changed: 29 additions & 7 deletions

File tree

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
setup(
88
name="text_preprocessing",
9-
version="1.0",
9+
version="1.0rc3",
1010
author="The ARTFL Project",
1111
author_email="clovisgladstone@gmail.com",
1212
packages=["text_preprocessing", "text_preprocessing.lang"],

text_preprocessing/preprocessor.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,7 @@ class PreProcessor:
299299
sentence_regex: str = r"[.!?]+"
300300
language: str = "french"
301301
lemmatizer: Dict[str, str] = {}
302+
lemmatizer_path: str = ""
302303
modernize: Callable = lambda x: x # workaround for mypy
303304
ngrams: int = 0
304305
ngram_gap: int = 0
@@ -401,6 +402,24 @@ def __init__(
401402
cls.workers = workers
402403
cls.post_func = post_processing_function
403404

405+
def __repr__(self):
406+
return self.__str__()
407+
408+
def __str__(self):
409+
preproc_repr = ""
410+
preproc_repr += "### Preprocessing options ###\n"
411+
preproc_repr += f"Language: {self.language}\n"
412+
for key, value in self.options.items():
413+
preproc_repr += f"{key}: {value}\n"
414+
if self.options["spacy_lemmatizer"] is False:
415+
preproc_repr += f"Lemmatizer path: {self.lemmatizer_path}\n"
416+
preproc_repr += f"word_regex: {self.word_regex}\n"
417+
preproc_repr += f"ngrams: {self.ngrams}\n"
418+
for key, value in self.filter_config.items():
419+
preproc_repr += f"{key}: {value}\n"
420+
preproc_repr += f"text_object_type: {self.text_object_type}\n"
421+
return preproc_repr
422+
404423
@classmethod
405424
def process_texts(
406425
cls,
@@ -593,15 +612,16 @@ def __run_nlp(cls, tokens: Iterable[Token]) -> List[Token]:
593612
doc = nlp(text_data)
594613
processed_doc: List[Token] = []
595614
for token, old_token in zip(doc, tokens):
596-
keep_token = False
597-
if cls.filter_config["ents_to_keep"] is not None and token.ent_type_ in cls.filter_config["ents_to_keep"]:
598-
keep_token = True
599-
if keep_token is False and cls.filter_config["pos_to_keep"] is not None:
600-
if token.pos_ in cls.filter_config["pos_to_keep"]:
601-
keep_token = True
615+
keep_token = True
616+
if cls.filter_config["ents_to_keep"] is not None and token.ent_type_ != "":
617+
if token.ent_type_ not in cls.filter_config["ents_to_keep"]:
618+
keep_token = False
619+
if cls.filter_config["pos_to_keep"] is not None and token.pos_ not in cls.filter_config["pos_to_keep"]:
620+
keep_token = False
602621
if keep_token is False and cls.keep_all is True:
603622
processed_doc.append(Token("", old_token.surface_form, token.pos_, token.ent_type_, old_token.ext))
604623
continue
624+
# print(keep_token)
605625
if cls.options["spacy_lemmatizer"] is True:
606626
new_token = Token(token.lemma_, old_token.surface_form, token.pos_, token.ent_type_, old_token.ext)
607627
elif cls.lemmatizer:

text_preprocessing/spacy_helpers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Helper functions for Spacy"""
2+
13
import re
24
from typing import Any, Dict, List
35

0 commit comments

Comments (0)