@@ -299,6 +299,7 @@ class PreProcessor:
299299 sentence_regex : str = r"[.!?]+"
300300 language : str = "french"
301301 lemmatizer : Dict [str , str ] = {}
302+ lemmatizer_path : str = ""
302303 modernize : Callable = lambda x : x # workaround for mypy
303304 ngrams : int = 0
304305 ngram_gap : int = 0
@@ -401,6 +402,24 @@ def __init__(
401402 cls .workers = workers
402403 cls .post_func = post_processing_function
403404
def __repr__(self):
    """Return the same human-readable summary that ``str(self)`` produces."""
    # Go through the builtin instead of calling the dunder method directly.
    return str(self)
407+
def __str__(self):
    """Return a multi-line, human-readable summary of the preprocessing setup.

    The summary lists, one entry per line and in this order: a header, the
    language, every key/value pair of ``self.options``, the lemmatizer path
    (only when ``options["spacy_lemmatizer"]`` is ``False``), the word regex,
    the ngram setting, every key/value pair of ``self.filter_config`` and the
    text object type. Every line, including the last, ends with a newline.
    """
    summary = [
        "### Preprocessing options ###",
        f"Language: {self.language}",
    ]
    summary.extend(f"{key}: {value}" for key, value in self.options.items())
    # .get() keeps printing/logging the object safe when the key is absent;
    # the path line is shown only when the spaCy lemmatizer is explicitly off.
    if self.options.get("spacy_lemmatizer") is False:
        summary.append(f"Lemmatizer path: {self.lemmatizer_path}")
    summary.append(f"word_regex: {self.word_regex}")
    summary.append(f"ngrams: {self.ngrams}")
    summary.extend(f"{key}: {value}" for key, value in self.filter_config.items())
    summary.append(f"text_object_type: {self.text_object_type}")
    # Single join instead of repeated string +=; keep the trailing newline.
    return "\n".join(summary) + "\n"
422+
404423 @classmethod
405424 def process_texts (
406425 cls ,
@@ -593,15 +612,16 @@ def __run_nlp(cls, tokens: Iterable[Token]) -> List[Token]:
593612 doc = nlp (text_data )
594613 processed_doc : List [Token ] = []
595614 for token , old_token in zip (doc , tokens ):
596- keep_token = False
597- if cls .filter_config ["ents_to_keep" ] is not None and token .ent_type_ in cls . filter_config [ "ents_to_keep" ] :
598- keep_token = True
599- if keep_token is False and cls . filter_config [ "pos_to_keep" ] is not None :
600- if token .pos_ in cls .filter_config ["pos_to_keep" ]:
601- keep_token = True
615+ keep_token = True
616+ if cls .filter_config ["ents_to_keep" ] is not None and token .ent_type_ != "" :
617+ if token . ent_type_ not in cls . filter_config [ "ents_to_keep" ]:
618+ keep_token = False
619+ if cls . filter_config [ "pos_to_keep" ] is not None and token .pos_ not in cls .filter_config ["pos_to_keep" ]:
620+ keep_token = False
602621 if keep_token is False and cls .keep_all is True :
603622 processed_doc .append (Token ("" , old_token .surface_form , token .pos_ , token .ent_type_ , old_token .ext ))
604623 continue
624+ # print(keep_token)
605625 if cls .options ["spacy_lemmatizer" ] is True :
606626 new_token = Token (token .lemma_ , old_token .surface_form , token .pos_ , token .ent_type_ , old_token .ext )
607627 elif cls .lemmatizer :
0 commit comments