Skip to content

Commit 7525207

Browse files
committed
fixes
1 parent 87e350a commit 7525207

1 file changed

Lines changed: 18 additions & 7 deletions

File tree

text_preprocessing/preprocessor.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -323,11 +323,13 @@ def __init__(
323323
language: str = "french",
324324
stemmer: bool = False,
325325
lemmatizer: Optional[str] = None,
326+
lemmatizer_path: Optional[str] = None,
326327
modernize: bool = False,
327328
ngrams: Optional[int] = None,
328329
ngram_gap: int = 0,
329330
ngram_word_order: bool = True,
330331
stopwords: Optional[str] = None,
332+
stopwords_path: Optional[str] = None,
331333
strip_punctuation: bool = True,
332334
strip_numbers: bool = True,
333335
strip_tags: bool = False,
@@ -375,6 +377,7 @@ def __init__(
375377
}
376378
if lemmatizer != "spacy":
377379
cls.lemmatizer = cls.__get_lemmatizer(lemmatizer)
380+
cls.lemmatizer_path = lemmatizer
378381
else:
379382
cls.options["spacy_lemmatizer"] = True
380383
cls.min_word_length = min_word_length
@@ -386,6 +389,7 @@ def __init__(
386389
else:
387390
cls.options["nlp"] = False
388391
cls.stopwords = cls.__get_stopwords(stopwords)
392+
cls.stopwords_path = stopwords
389393
cls.ngrams = ngrams or 0
390394
if cls.ngrams:
391395
cls.ngram_window = cls.ngrams + ngram_gap
@@ -413,6 +417,7 @@ def __str__(self):
413417
preproc_repr += f"{key}: {value}\n"
414418
if self.options["spacy_lemmatizer"] is False:
415419
preproc_repr += f"Lemmatizer path: {self.lemmatizer_path}\n"
420+
preproc_repr += f"stopwords: {self.stopwords_path}\n"
416421
preproc_repr += f"word_regex: {self.word_regex}\n"
417422
preproc_repr += f"ngrams: {self.ngrams}\n"
418423
for key, value in self.filter_config.items():
@@ -500,7 +505,7 @@ def process_string(cls, text, keep_all=True):
500505
tokens = list(cls.tokenize_text(text))
501506
cls.keep_all = keep_all
502507
if cls.options["with_pos"] is True or cls.pos_to_keep or cls.options["spacy_lemmatizer"] is True:
503-
tokens = cls.__run_nlp(tokens)
508+
tokens = cls.__run_nlp(tokens, from_string=True)
504509
elif cls.lemmatizer and cls.options["spacy_lemmatizer"] is False:
505510
tokens = [Token(cls.lemmatizer.get(word, word), word.surface_form, ext=word.ext) for word in tokens]
506511
return cls.__normalize_doc(tokens)
@@ -602,8 +607,11 @@ def __get_lemmatizer(cls, file_path: Optional[str]) -> Dict[str, str]:
602607
return lemmas
603608

604609
@classmethod
605-
def __run_nlp(cls, tokens: Iterable[Token]) -> List[Token]:
606-
text_data = " ".join(tokens)
610+
def __run_nlp(cls, tokens: Iterable[Token], from_string=False) -> List[Token]:
611+
if from_string is True:
612+
text_data = "".join(tokens)
613+
else:
614+
text_data = " ".join(tokens)
607615
try:
608616
doc = cls.nlp(text_data)
609617
except ValueError: # text is longer than 1,000,000 characters (default spacy limit)
@@ -619,22 +627,25 @@ def __run_nlp(cls, tokens: Iterable[Token]) -> List[Token]:
619627
if cls.filter_config["pos_to_keep"] is not None and token.pos_ not in cls.filter_config["pos_to_keep"]:
620628
keep_token = False
621629
if keep_token is False and cls.keep_all is True:
622-
processed_doc.append(Token("", old_token.surface_form, token.pos_, token.ent_type_, old_token.ext))
630+
processed_doc.append(Token("", token.text, token.pos_, token.ent_type_, old_token.ext))
631+
if token.whitespace_:
632+
processed_doc.append(Token(token.whitespace_, token.whitespace_, "", "", old_token.ext))
623633
continue
624-
# print(keep_token)
625634
if cls.options["spacy_lemmatizer"] is True:
626-
new_token = Token(token.lemma_, old_token.surface_form, token.pos_, token.ent_type_, old_token.ext)
635+
new_token = Token(token.lemma_, token.text, token.pos_, token.ent_type_, old_token.ext)
627636
elif cls.lemmatizer:
628637
new_token = Token(
629638
cls.lemmatizer.get(token.text.lower(), token.text.lower()),
630-
old_token.surface_form,
639+
token.text,
631640
token.pos_,
632641
token.ent_type_,
633642
old_token.ext,
634643
)
635644
else:
636645
new_token = Token(token.text, old_token.surface_form, token.pos_, token.ent_type_, old_token.ext)
637646
processed_doc.append(new_token)
647+
if token.whitespace_:
648+
processed_doc.append(Token(token.whitespace_, token.whitespace_, "", "", old_token.ext))
638649
return processed_doc
639650

640651
@classmethod

0 commit comments

Comments (0)