ARTFL-Project
diff --git a/tests/__pycache__/test_preprocessor.cpython-313-pytest-9.0.2.pyc
2.06 KB b/tests/__pycache__/test_preprocessor.cpython-313-pytest-9.0.2.pyc
2.06 KB
diff --git a/tests/__pycache__/test_spacy_models.cpython-313-pytest-9.0.2.pyc
1.81 KB b/tests/__pycache__/test_spacy_models.cpython-313-pytest-9.0.2.pyc
1.81 KB
diff --git a/tests/__pycache__/test_tokens.cpython-313-pytest-9.0.2.pyc
2.65 KB b/tests/__pycache__/test_tokens.cpython-313-pytest-9.0.2.pyc
2.65 KB
diff --git a/tests/test_preprocessor.py
Lines changed: 13 additions & 0 deletions b/tests/test_preprocessor.py
Lines changed: 13 additions & 0 deletions
diff --git a/tests/test_spacy_models.py
Lines changed: 11 additions & 0 deletions b/tests/test_spacy_models.py
Lines changed: 11 additions & 0 deletions
diff --git a/tests/test_tokens.py
Lines changed: 22 additions & 0 deletions b/tests/test_tokens.py
Lines changed: 22 additions & 0 deletions
diff --git a/text_preprocessing/preprocessor.py
Lines changed: 26 additions & 13 deletions b/text_preprocessing/preprocessor.py
Lines changed: 26 additions & 13 deletions
diff --git a/text_preprocessing/spacy_helpers.py
Lines changed: 6 additions & 5 deletions b/text_preprocessing/spacy_helpers.py
Lines changed: 6 additions & 5 deletions
@@ -381,6 +381,19 @@ def append_marker(tokens):
         assert "world_X" in texts
         assert "foo_X" in texts
 
+    def test_post_processing_function_applied_in_process_string(self):
+        def append_marker(tokens):
+            for tok in tokens:
+                if tok.text and tok.text != " ":
+                    tok.text = tok.text + "_X"
+            return tokens
+
+        p = PreProcessor(language="english", post_processing_function=append_marker, workers=1)
+        texts = words(p.process_string("hello world foo"))
+        assert "hello_X" in texts
+        assert "world_X" in texts
+        assert "foo_X" in texts
+
     # --- workers=1 explicit ---
 
     def test_single_worker_produces_same_output(self):
 
@@ -251,6 +251,17 @@ def test_en_lemmatizer_does_not_alter_already_base_form(self):
         assert "cat" in texts
         assert "dog" in texts
 
+    def test_en_lemmatizer_works_without_pos_to_keep(self):
+        # Tagger must be kept even when pos_to_keep is unset, because the
+        # English rule-based lemmatizer depends on POS annotations.
+        p = PreProcessor(language="english", language_model=EN_MODEL,
+                         lemmatizer="spacy", workers=1)
+        toks = tokens_with_pos(p.process_string("dogs are running in cities"))
+        texts = [t.text for t in toks]
+        assert "dog" in texts       # dogs → dog
+        assert "run" in texts       # running → run
+        assert "city" in texts      # cities → city
+
 
 # ===========================================================================
 # Integration: language model + corpus files
 
@@ -109,6 +109,28 @@ def test_iter_text_values(self):
         tokens = make_tokens(["hello", "world", "foo"])
         assert [t.text for t in tokens] == ["hello", "world", "foo"]
 
+    def test_next_yields_all_tokens(self):
+        tokens = make_tokens(["a", "b", "c"])
+        result = []
+        try:
+            while True:
+                result.append(next(tokens))
+        except StopIteration:
+            pass
+        assert [t.text for t in result] == ["a", "b", "c"]
+
+    def test_next_raises_stop_iteration(self):
+        tokens = make_tokens(["a"])
+        next(tokens)
+        with pytest.raises(StopIteration):
+            next(tokens)
+
+    def test_next_resets_after_exhaustion(self):
+        tokens = make_tokens(["a", "b"])
+        list(tokens)  # exhaust via __iter__
+        # __next__ should work again from the start
+        assert next(tokens).text == "a"
+
     def test_getitem_int(self):
         tokens = make_tokens(["hello", "world"])
         assert tokens[0].text == "hello"
 
@@ -50,7 +50,13 @@ class PreparedDoc:
 
 
 class PreProcessor:
-    """Text Preprocessing class"""
+    """Text preprocessing pipeline.
+
+    Only one instance should be active at a time: configuration is stored on
+    TextFetcher as class variables, so a second instance would overwrite the
+    first one's settings. Forked workers inherit that state via copy-on-write,
+    avoiding costly pickling of the spaCy model and language dictionaries.
+    """
 
     def __init__(
         self,
@@ -192,7 +198,10 @@ def process_string(self, text: str, keep_all: bool = True) -> Tokens:
         """Take a string and return a list of preprocessed tokens"""
         doc = self.text_fetcher.process_string(text)
         processed_doc = self.nlp(doc)
-        return Tokens(processed_doc, keep_all=keep_all)
+        tokens = Tokens(processed_doc, keep_all=keep_all)
+        if self.post_func is not None:
+            tokens = self.post_func(tokens)
+        return tokens
 
     def __split_spacy_docs(self, doc: Doc) -> list[Doc]:
         """Split spacy doc into smaller docs of 10 sentences"""
@@ -202,23 +211,27 @@ def __split_spacy_docs(self, doc: Doc) -> list[Doc]:
             if len(sentence_group) == 10:
                 docs.append(Doc.from_docs(sentence_group))
                 sentence_group = []
-            else:
-                sent_starts = []
-                words = []
-                for token in sent:
-                    sent_starts.append(token.is_sent_start)
-                    words.append(token.text)
-                sent_doc = Doc(self.nlp.vocab, words, sent_starts=sent_starts)
-                for pos, token in enumerate(sent):
-                    sent_doc[pos]._.ext = token._.ext
-                sentence_group.append(sent_doc)
+            sent_starts = []
+            words = []
+            for token in sent:
+                sent_starts.append(token.is_sent_start)
+                words.append(token.text)
+            sent_doc = Doc(self.nlp.vocab, words, sent_starts=sent_starts)
+            for pos, token in enumerate(sent):
+                sent_doc[pos]._.ext = token._.ext
+            sentence_group.append(sent_doc)
         if sentence_group:
             docs.append(Doc.from_docs(sentence_group))
         return docs
 
 
 class TextFetcher:
-    """Text fetcher"""
+    """Tokeniser and file reader for PreProcessor.
+
+    Configuration is kept as class variables so that forked worker processes
+    inherit the spaCy model and language dictionaries via copy-on-write without
+    pickling them.  Workers treat this state as read-only.
+    """
 
     word_regex: str = r"[\p{L}\p{M}\p{N}]+|'"
     sentence_boundaries: list[str] = [".", "!", "?"]
 
@@ -130,11 +130,12 @@ def __iter__(self) -> Iterable[PreprocessorToken]:
             yield token
 
     def __next__(self):
+        if self.iter_index >= self.length:
+            self.iter_index = 0
+            raise StopIteration
+        token = self.tokens[self.iter_index]
         self.iter_index += 1
-        if self.iter_index < self.length:
-            return self.tokens[self.iter_index]
-        else:
-            raise IndexError
+        return token
 
     def __getitem__(self, index: Union[int, slice]) -> Union[PreprocessorToken, Iterable[PreprocessorToken]]:
         if isinstance(index, int):
@@ -484,7 +485,7 @@ def load_language_model(language_model, normalize_options: dict[str, Any]) -> tu
         )
     ):
         disabled_pipelines = ["tokenizer", "textcat"]
-        if not normalize_options["pos_to_keep"]:
+        if not normalize_options["pos_to_keep"] and normalize_options["lemmatizer"] != "spacy":
             disabled_pipelines.append("tagger")
         if not normalize_options["ents_to_keep"]:
             disabled_pipelines.append("ner")