@@ -50,7 +50,13 @@ class PreparedDoc:
5050
5151
5252class PreProcessor :
53- """Text Preprocessing class"""
53+ """Text preprocessing pipeline.
54+
55+ Only one instance should be active at a time: configuration is stored on
56+ TextFetcher as class variables so forked workers inherit it via
57+ copy-on-write, avoiding costly pickling of the spaCy model and language
58+ dictionaries.
59+ """
5460
5561 def __init__ (
5662 self ,
@@ -192,7 +198,10 @@ def process_string(self, text: str, keep_all: bool = True) -> Tokens:
192198 """Take a string and return a list of preprocessed tokens"""
193199 doc = self .text_fetcher .process_string (text )
194200 processed_doc = self .nlp (doc )
195- return Tokens (processed_doc , keep_all = keep_all )
201+ tokens = Tokens (processed_doc , keep_all = keep_all )
202+ if self .post_func is not None :
203+ tokens = self .post_func (tokens )
204+ return tokens
196205
197206 def __split_spacy_docs (self , doc : Doc ) -> list [Doc ]:
198207 """Split spacy doc into smaller docs of 10 sentences"""
@@ -202,23 +211,27 @@ def __split_spacy_docs(self, doc: Doc) -> list[Doc]:
202211 if len (sentence_group ) == 10 :
203212 docs .append (Doc .from_docs (sentence_group ))
204213 sentence_group = []
205- else :
206- sent_starts = []
207- words = []
208- for token in sent :
209- sent_starts .append (token .is_sent_start )
210- words .append (token .text )
211- sent_doc = Doc (self .nlp .vocab , words , sent_starts = sent_starts )
212- for pos , token in enumerate (sent ):
213- sent_doc [pos ]._ .ext = token ._ .ext
214- sentence_group .append (sent_doc )
214+ sent_starts = []
215+ words = []
216+ for token in sent :
217+ sent_starts .append (token .is_sent_start )
218+ words .append (token .text )
219+ sent_doc = Doc (self .nlp .vocab , words , sent_starts = sent_starts )
220+ for pos , token in enumerate (sent ):
221+ sent_doc [pos ]._ .ext = token ._ .ext
222+ sentence_group .append (sent_doc )
215223 if sentence_group :
216224 docs .append (Doc .from_docs (sentence_group ))
217225 return docs
218226
219227
220228class TextFetcher :
221- """Text fetcher"""
229+ """Tokeniser and file reader for PreProcessor.
230+
231+ Configuration is kept as class variables so that forked worker processes
232+ inherit the spaCy model and language dictionaries via copy-on-write without
233+ pickling them. Workers treat this state as read-only.
234+ """
222235
223236 word_regex : str = r"[\p{L}\p{M}\p{N}]+|'"
224237 sentence_boundaries : list [str ] = ["." , "!" , "?" ]
0 commit comments