@@ -393,14 +393,25 @@ def process_texts(
         fetched_texts = self.text_fetcher(
             texts, do_nlp=self.do_nlp, keep_all=keep_all, progress=progress, post_func=self.post_func
         )
+        if self.text_fetcher.text_object_type in ("para", "sent"):
+            fetched_texts = self.nlp.pipe(
+                ((make_spacy_doc(self.nlp, tokens), c) for tokens, c in fetched_texts), as_tuples=True
+            )
         for tokens, doc_count in fetched_texts:
             count += 1
             if progress is True:
-                print(
-                    f"\r{progress_prefix} {doc_count} done: {count} text objects extracted...",
-                    end="",
-                    flush=True,
-                )
+                if doc_count is not None:  # workaround for sent and para since nlp.pipe does not return context...
+                    print(
+                        f"\r{progress_prefix} {doc_count} done: {count} text objects extracted...",
+                        end="",
+                        flush=True,
+                    )
+                else:
+                    print(
+                        f"\r{progress_prefix} {count} text objects extracted...",
+                        end="",
+                        flush=True,
+                    )
             if isinstance(tokens, PreparedDoc):
                 spacy_doc = make_spacy_doc(self.nlp, tokens)
                 if spacy_doc._.char_num > 10000:
@@ -415,6 +426,13 @@ def process_texts(
                 if self.post_func is not None:
                     processed_doc = self.post_func(tokens)
                 yield processed_doc
+            elif isinstance(tokens, Doc):
+                tokens = Tokens(tokens, keep_all=keep_all)
+                if self.ngram_config is not None:
+                    tokens = generate_ngrams(**self.ngram_config, tokens=tokens)
+                if self.post_func is not None:
+                    tokens = self.post_func(tokens)
+                yield tokens
             else:
                 yield tokens
 
0 commit comments