Skip to content

Commit e41cd9d

Browse files
committed (author not captured in this extract)
optimize for sent and para objects
1 parent e23f425 commit e41cd9d

1 file changed

Lines changed: 23 additions & 5 deletions

File tree

text_preprocessing/preprocessor.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -393,14 +393,25 @@ def process_texts(
393393
fetched_texts = self.text_fetcher(
394394
texts, do_nlp=self.do_nlp, keep_all=keep_all, progress=progress, post_func=self.post_func
395395
)
396+
if self.text_fetcher.text_object_type in ("para", "sent"):
397+
fetched_texts = self.nlp.pipe(
398+
((make_spacy_doc(self.nlp, tokens), c) for tokens, c in fetched_texts), as_tuples=True
399+
)
396400
for tokens, doc_count in fetched_texts:
397401
count += 1
398402
if progress is True:
399-
print(
400-
f"\r{progress_prefix} {doc_count} done: {count} text objects extracted... ",
401-
end="",
402-
flush=True,
403-
)
403+
if doc_count is not None: # workaround for sent and para since nlp.pipe does not return context...
404+
print(
405+
f"\r{progress_prefix} {doc_count} done: {count} text objects extracted... ",
406+
end="",
407+
flush=True,
408+
)
409+
else:
410+
print(
411+
f"\r{progress_prefix} {count} text objects extracted... ",
412+
end="",
413+
flush=True,
414+
)
404415
if isinstance(tokens, PreparedDoc):
405416
spacy_doc = make_spacy_doc(self.nlp, tokens)
406417
if spacy_doc._.char_num > 10000:
@@ -415,6 +426,13 @@ def process_texts(
415426
if self.post_func is not None:
416427
processed_doc = self.post_func(tokens)
417428
yield processed_doc
429+
elif isinstance(tokens, Doc):
430+
tokens = Tokens(tokens, keep_all=keep_all)
431+
if self.ngram_config is not None:
432+
tokens = generate_ngrams(**self.ngram_config, tokens=tokens)
433+
if self.post_func is not None:
434+
tokens = self.post_func(tokens)
435+
yield tokens
418436
else:
419437
yield tokens
420438

0 commit comments

Comments (0)