@@ -323,11 +323,13 @@ def __init__(
323323 language : str = "french" ,
324324 stemmer : bool = False ,
325325 lemmatizer : Optional [str ] = None ,
326+ lemmatizer_path : Optional [str ] = None ,
326327 modernize : bool = False ,
327328 ngrams : Optional [int ] = None ,
328329 ngram_gap : int = 0 ,
329330 ngram_word_order : bool = True ,
330331 stopwords : Optional [str ] = None ,
332+ stopwords_path : Optional [str ] = None ,
331333 strip_punctuation : bool = True ,
332334 strip_numbers : bool = True ,
333335 strip_tags : bool = False ,
@@ -375,6 +377,7 @@ def __init__(
375377 }
376378 if lemmatizer != "spacy" :
377379 cls .lemmatizer = cls .__get_lemmatizer (lemmatizer )
380+ cls .lemmatizer_path = lemmatizer
378381 else :
379382 cls .options ["spacy_lemmatizer" ] = True
380383 cls .min_word_length = min_word_length
@@ -386,6 +389,7 @@ def __init__(
386389 else :
387390 cls .options ["nlp" ] = False
388391 cls .stopwords = cls .__get_stopwords (stopwords )
392+ cls .stopwords_path = stopwords
389393 cls .ngrams = ngrams or 0
390394 if cls .ngrams :
391395 cls .ngram_window = cls .ngrams + ngram_gap
@@ -413,6 +417,7 @@ def __str__(self):
413417 preproc_repr += f"{ key } : { value } \n "
414418 if self .options ["spacy_lemmatizer" ] is False :
415419 preproc_repr += f"Lemmatizer path: { self .lemmatizer_path } \n "
420+ preproc_repr += f"stopwords: { self .stopwords_path } \n "
416421 preproc_repr += f"word_regex: { self .word_regex } \n "
417422 preproc_repr += f"ngrams: { self .ngrams } \n "
418423 for key , value in self .filter_config .items ():
@@ -500,7 +505,7 @@ def process_string(cls, text, keep_all=True):
500505 tokens = list (cls .tokenize_text (text ))
501506 cls .keep_all = keep_all
502507 if cls .options ["with_pos" ] is True or cls .pos_to_keep or cls .options ["spacy_lemmatizer" ] is True :
503- tokens = cls .__run_nlp (tokens )
508+ tokens = cls .__run_nlp (tokens , from_string = True )
504509 elif cls .lemmatizer and cls .options ["spacy_lemmatizer" ] is False :
505510 tokens = [Token (cls .lemmatizer .get (word , word ), word .surface_form , ext = word .ext ) for word in tokens ]
506511 return cls .__normalize_doc (tokens )
@@ -602,8 +607,11 @@ def __get_lemmatizer(cls, file_path: Optional[str]) -> Dict[str, str]:
602607 return lemmas
603608
604609 @classmethod
605- def __run_nlp (cls , tokens : Iterable [Token ]) -> List [Token ]:
606- text_data = " " .join (tokens )
610+ def __run_nlp (cls , tokens : Iterable [Token ], from_string = False ) -> List [Token ]:
611+ if from_string is True :
612+ text_data = "" .join (tokens )
613+ else :
614+ text_data = " " .join (tokens )
607615 try :
608616 doc = cls .nlp (text_data )
609617 except ValueError : # text is longer than 1,000,000 characters (default spacy limit)
@@ -619,22 +627,25 @@ def __run_nlp(cls, tokens: Iterable[Token]) -> List[Token]:
619627 if cls .filter_config ["pos_to_keep" ] is not None and token .pos_ not in cls .filter_config ["pos_to_keep" ]:
620628 keep_token = False
621629 if keep_token is False and cls .keep_all is True :
622- processed_doc .append (Token ("" , old_token .surface_form , token .pos_ , token .ent_type_ , old_token .ext ))
630+ processed_doc .append (Token ("" , token .text , token .pos_ , token .ent_type_ , old_token .ext ))
631+ if token .whitespace_ :
632+ processed_doc .append (Token (token .whitespace_ , token .whitespace_ , "" , "" , old_token .ext ))
623633 continue
624- # print(keep_token)
625634 if cls .options ["spacy_lemmatizer" ] is True :
626- new_token = Token (token .lemma_ , old_token . surface_form , token .pos_ , token .ent_type_ , old_token .ext )
635+ new_token = Token (token .lemma_ , token . text , token .pos_ , token .ent_type_ , old_token .ext )
627636 elif cls .lemmatizer :
628637 new_token = Token (
629638 cls .lemmatizer .get (token .text .lower (), token .text .lower ()),
630- old_token . surface_form ,
639+ token . text ,
631640 token .pos_ ,
632641 token .ent_type_ ,
633642 old_token .ext ,
634643 )
635644 else :
636645 new_token = Token (token .text , old_token .surface_form , token .pos_ , token .ent_type_ , old_token .ext )
637646 processed_doc .append (new_token )
647+ if token .whitespace_ :
648+ processed_doc .append (Token (token .whitespace_ , token .whitespace_ , "" , "" , old_token .ext ))
638649 return processed_doc
639650
640651 @classmethod
0 commit comments