@@ -1,12 +1,12 @@
 #!/usr/bin/env python3
 """Text Preprocessor"""
 
-
 import json
 import os
 import sqlite3
 import sys
 from collections import defaultdict, deque
+from dataclasses import dataclass
 from itertools import combinations
 from typing import Any, Callable, DefaultDict, Deque, Iterable, Iterator, Union, overload
 
@@ -39,6 +39,17 @@
 PHILO_OBJECT_LEVEL: dict[int, str] = {1: "doc", 2: "div1", 3: "div2", 4: "div3", 5: "para", 6: "sent", 7: "word"}
 
 
+@dataclass(slots=True)
+class PreparedDoc:
+    """Prepared doc for conversion to a spaCy Doc object"""
+
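+    # Holds only plain Python data (no spaCy objects), so instances pickle
+    # cheaply across worker processes; presumably why Doc construction is
+    # deferred until after multiprocessing.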
+    words: list[str]
+    sent_starts: list[bool]
+    metadata: dict[str, Any]
+    exts: list[dict[str, Any]]
+    char_num: int
+
+
 class PreprocessorToken(str):
     """Token Object class inheriting from string
 
@@ -337,7 +348,7 @@ def __init__(
             "ents_to_keep": ents_to_keep or [],
         }
         self.language = language
-        self.nlp = load_language_model(self.language, self.normalize_options, text_object_type)
+        self.nlp = load_language_model(self.language, self.normalize_options)
         if workers is None:
             cpu_count = os.cpu_count() or 2
             self.workers = cpu_count - 1
@@ -390,17 +401,15 @@ def process_texts(
                     end="",
                     flush=True,
                 )
-            if isinstance(tokens, Doc):
-                if self.text_fetcher.text_object_type in ("sent", "para"):
-                    tokens = Tokens(tokens, keep_all=keep_all)
+            if isinstance(tokens, PreparedDoc):
+                spacy_doc = make_spacy_doc(self.nlp, tokens)
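+                # Docs longer than 10,000 characters are chunked into
+                # 10-sentence sub-docs, piped, and merged back; presumably to
+                # keep per-document pipeline memory in check.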
+                if spacy_doc._.char_num > 10000:
+                    split_doc = self.__split_spacy_docs(spacy_doc)
+                    rebuilt_doc = Doc.from_docs(list(self.nlp.pipe(split_doc)))
+                    rebuilt_doc._.metadata = spacy_doc._.metadata
+                    tokens = Tokens(rebuilt_doc, keep_all=keep_all)
                 else:
-                    if tokens._.char_num > 10000:
-                        split_doc = self.__split_spacy_docs(tokens)
-                        rebuilt_doc = Doc.from_docs(list(self.nlp.pipe(split_doc)))
-                        rebuilt_doc._.metadata = tokens._.metadata
-                        tokens = Tokens(rebuilt_doc, keep_all=keep_all)
-                    else:
-                        tokens = Tokens(self.nlp(tokens), keep_all=keep_all)
+                    tokens = Tokens(self.nlp(spacy_doc), keep_all=keep_all)
             if self.ngram_config is not None:
                 tokens = generate_ngrams(**self.ngram_config, tokens=tokens)
             if self.post_func is not None:
@@ -416,11 +425,11 @@ def process_string(self, text: str, keep_all: bool = True) -> Tokens:
         return Tokens(processed_doc, keep_all=keep_all)
 
     def __split_spacy_docs(self, doc: Doc) -> list[Doc]:
-        """Split spacy doc into smaller docs of 50 sentences"""
+        """Split a spaCy doc into smaller docs of 10 sentences"""
         sentence_group: list[Doc] = []
         docs: list[Doc] = []
         for sent in doc.sents:
-            if len(sentence_group) == 50:
+            if len(sentence_group) == 10:
                 docs.append(Doc.from_docs(sentence_group))
                 sentence_group = []
             else:
@@ -495,7 +504,7 @@ def __call__(
         keep_all=False,
         progress: bool = True,
         post_func: Callable | None = None,
-    ) -> Iterable[tuple[Doc | Tokens, int]]:
+    ) -> Iterable[tuple[PreparedDoc | Tokens, int]]:
        """Process all documents. Returns an iterator of documents"""
         doc_count: int = 0
         if progress is True:
@@ -510,15 +519,16 @@
             yield doc, doc_count
 
     @classmethod
-    def __local_process(cls, args) -> Iterable[Doc | Tokens]:
+    def __local_process(cls, args) -> Iterable[PreparedDoc | Tokens]:
         text, do_nlp, keep_all, post_func = args
         if cls.is_philo_db is True:
             text_objects, sent_starts_list, metadata = cls.process_philo_text(text)
         else:
             text_objects, sent_starts_list, metadata = cls.process_text(text)
-        spacy_docs = cls.__make_spacy_doc(text_objects, sent_starts_list, metadata)
+        docs = cls.__prepare_docs(text_objects, sent_starts_list, metadata)
         if do_nlp is True:
-            return list(spacy_docs)
+            return docs
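+        # Otherwise, build spaCy Docs lazily from the prepared docs, one at a
+        # time, rather than materializing them all before tokenizing.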
+        spacy_docs = (make_spacy_doc(cls.model, doc) for doc in docs)
         tokens_list: list[Tokens] = []
         for spacy_doc in spacy_docs:
             tokens = Tokens(cls.model(spacy_doc), keep_all=keep_all)
@@ -530,20 +540,19 @@ def __local_process(cls, args) -> Iterable[Doc | Tokens]:
         return tokens_list
 
     @classmethod
-    def __make_spacy_doc(cls, text_objects, sent_starts_list, metadata) -> Iterable[Doc]:
-        """Make spacy doc from list of tokens"""
+    def __prepare_docs(cls, text_objects, sent_starts_list, metadata) -> list[PreparedDoc]:
+        """Prepare docs for conversion to spaCy Docs"""
+        list_doc_words: list[PreparedDoc] = []
         for processed_doc, sent_starts, local_metadata in zip(text_objects, sent_starts_list, metadata):
             words = []
+            exts = []
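+            # exts parallels words: one extension dict per token, re-attached
+            # later in make_spacy_doc().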
             char_num = 0
-            for word, _ in processed_doc:
+            for word, ext in processed_doc:
                 char_num += len(word)
                 words.append(word)
-            doc = Doc(cls.model.vocab, words, sent_starts=sent_starts)
-            doc._.metadata = local_metadata
-            doc._.char_num = char_num
-            for pos, (_, ext) in enumerate(processed_doc):
-                doc[pos]._.ext = ext
-            yield doc
+                exts.append(ext)
+            list_doc_words.append(PreparedDoc(words, sent_starts, local_metadata, exts, char_num))
+        return list_doc_words
 
     @classmethod
     def process_text(cls, text: str):
@@ -733,6 +742,16 @@ def recursive_search(
     return obj_metadata, metadata_cache
 
 
+def make_spacy_doc(model: Language, prepared_doc: PreparedDoc) -> Doc:
+    """Build a spaCy Doc from a PreparedDoc"""
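+    # Reconstructs the Doc from plain data, then restores the document-level
+    # metadata and per-token extension attributes gathered in __prepare_docs.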
+    doc = Doc(model.vocab, prepared_doc.words, sent_starts=prepared_doc.sent_starts)
+    doc._.metadata = prepared_doc.metadata
+    doc._.char_num = prepared_doc.char_num
+    for pos, ext in enumerate(prepared_doc.exts):
+        doc[pos]._.ext = ext
+    return doc
+
+
 def main():
     """Performance testing"""
     import timeit