Skip to content

Commit e23f425

Browse files
committed
Enable running Spacy models through GPU
1 parent 904806f commit e23f425

2 files changed

Lines changed: 51 additions & 31 deletions

File tree

text_preprocessing/preprocessor.py

Lines changed: 46 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
#!/usr/bin/env python3
22
"""Text Preprocessor"""
33

4-
54
import json
65
import os
76
import sqlite3
87
import sys
98
from collections import defaultdict, deque
9+
from dataclasses import dataclass
1010
from itertools import combinations
1111
from typing import Any, Callable, DefaultDict, Deque, Iterable, Iterator, Union, overload
1212

@@ -39,6 +39,17 @@
3939
PHILO_OBJECT_LEVEL: dict[int, str] = {1: "doc", 2: "div1", 3: "div2", 4: "div3", 5: "para", 6: "sent", 7: "word"}
4040

4141

42+
@dataclass(slots=True)
43+
class PreparedDoc:
44+
"""Prepared doc for conversion to Spacy Doc object"""
45+
46+
words: list[str]
47+
sent_starts: list[bool]
48+
metadata: dict[str, Any]
49+
exts: list[dict[str, Any]]
50+
char_num: int
51+
52+
4253
class PreprocessorToken(str):
4354
"""Token Object class inheriting from string
4455
@@ -337,7 +348,7 @@ def __init__(
337348
"ents_to_keep": ents_to_keep or [],
338349
}
339350
self.language = language
340-
self.nlp = load_language_model(self.language, self.normalize_options, text_object_type)
351+
self.nlp = load_language_model(self.language, self.normalize_options)
341352
if workers is None:
342353
cpu_count = os.cpu_count() or 2
343354
self.workers = cpu_count - 1
@@ -390,17 +401,15 @@ def process_texts(
390401
end="",
391402
flush=True,
392403
)
393-
if isinstance(tokens, Doc):
394-
if self.text_fetcher.text_object_type in ("sent", "para"):
395-
tokens = Tokens(tokens, keep_all=keep_all)
404+
if isinstance(tokens, PreparedDoc):
405+
spacy_doc = make_spacy_doc(self.nlp, tokens)
406+
if spacy_doc._.char_num > 10000:
407+
split_doc = self.__split_spacy_docs(spacy_doc)
408+
rebuilt_doc = Doc.from_docs(list(self.nlp.pipe(split_doc)))
409+
rebuilt_doc._.metadata = spacy_doc._.metadata
410+
tokens = Tokens(rebuilt_doc, keep_all=keep_all)
396411
else:
397-
if tokens._.char_num > 10000:
398-
split_doc = self.__split_spacy_docs(tokens)
399-
rebuilt_doc = Doc.from_docs(list(self.nlp.pipe(split_doc)))
400-
rebuilt_doc._.metadata = tokens._.metadata
401-
tokens = Tokens(rebuilt_doc, keep_all=keep_all)
402-
else:
403-
tokens = Tokens(self.nlp(tokens), keep_all=keep_all)
412+
tokens = Tokens(self.nlp(spacy_doc), keep_all=keep_all)
404413
if self.ngram_config is not None:
405414
tokens = generate_ngrams(**self.ngram_config, tokens=tokens)
406415
if self.post_func is not None:
@@ -416,11 +425,11 @@ def process_string(self, text: str, keep_all: bool = True) -> Tokens:
416425
return Tokens(processed_doc, keep_all=keep_all)
417426

418427
def __split_spacy_docs(self, doc: Doc) -> list[Doc]:
419-
"""Split spacy doc into smaller docs of 50 sentences"""
428+
"""Split spacy doc into smaller docs of 10 sentences"""
420429
sentence_group: list[Doc] = []
421430
docs: list[Doc] = []
422431
for sent in doc.sents:
423-
if len(sentence_group) == 50:
432+
if len(sentence_group) == 10:
424433
docs.append(Doc.from_docs(sentence_group))
425434
sentence_group = []
426435
else:
@@ -495,7 +504,7 @@ def __call__(
495504
keep_all=False,
496505
progress: bool = True,
497506
post_func: Callable | None = None,
498-
) -> Iterable[tuple[Doc | Tokens, int]]:
507+
) -> Iterable[tuple[PreparedDoc | Tokens, int]]:
499508
"""Process all documents. Returns an iterator of documents"""
500509
doc_count: int = 0
501510
if progress is True:
@@ -510,15 +519,16 @@ def __call__(
510519
yield doc, doc_count
511520

512521
@classmethod
513-
def __local_process(cls, args) -> Iterable[Doc | Tokens]:
522+
def __local_process(cls, args) -> Iterable[PreparedDoc | Tokens]:
514523
text, do_nlp, keep_all, post_func = args
515524
if cls.is_philo_db is True:
516525
text_objects, sent_starts_list, metadata = cls.process_philo_text(text)
517526
else:
518527
text_objects, sent_starts_list, metadata = cls.process_text(text)
519-
spacy_docs = cls.__make_spacy_doc(text_objects, sent_starts_list, metadata)
528+
docs = cls.__prepare_docs(text_objects, sent_starts_list, metadata)
520529
if do_nlp is True:
521-
return list(spacy_docs)
530+
return docs
531+
spacy_docs = (make_spacy_doc(cls.model, doc) for doc in docs)
522532
tokens_list: list[Tokens] = []
523533
for spacy_doc in spacy_docs:
524534
tokens = Tokens(cls.model(spacy_doc), keep_all=keep_all)
@@ -530,20 +540,19 @@ def __local_process(cls, args) -> Iterable[Doc | Tokens]:
530540
return tokens_list
531541

532542
@classmethod
533-
def __make_spacy_doc(cls, text_objects, sent_starts_list, metadata) -> Iterable[Doc]:
534-
"""Make spacy doc from list of tokens"""
543+
def __prepare_docs(cls, text_objects, sent_starts_list, metadata) -> list[PreparedDoc]:
544+
"""Prepare doc for creating Spacy doc"""
545+
list_doc_words: list[PreparedDoc] = []
535546
for processed_doc, sent_starts, local_metadata in zip(text_objects, sent_starts_list, metadata):
536547
words = []
548+
exts = []
537549
char_num = 0
538-
for word, _ in processed_doc:
550+
for word, ext in processed_doc:
539551
char_num += len(word)
540552
words.append(word)
541-
doc = Doc(cls.model.vocab, words, sent_starts=sent_starts)
542-
doc._.metadata = local_metadata
543-
doc._.char_num = char_num
544-
for pos, (_, ext) in enumerate(processed_doc):
545-
doc[pos]._.ext = ext
546-
yield doc
553+
exts.append(ext)
554+
list_doc_words.append(PreparedDoc(words, sent_starts, local_metadata, exts, char_num))
555+
return list_doc_words
547556

548557
@classmethod
549558
def process_text(cls, text: str):
@@ -733,6 +742,16 @@ def recursive_search(
733742
return obj_metadata, metadata_cache
734743

735744

745+
def make_spacy_doc(model: Language, prepared_doc: PreparedDoc) -> Doc:
746+
"""Make Spacy doc"""
747+
doc = Doc(model.vocab, prepared_doc.words, sent_starts=prepared_doc.sent_starts)
748+
doc._.metadata = prepared_doc.metadata
749+
doc._.char_num = prepared_doc.char_num
750+
for pos, ext in enumerate(prepared_doc.exts):
751+
doc[pos]._.ext = ext
752+
return doc
753+
754+
736755
def main():
737756
"""Performance testing"""
738757
import timeit

text_preprocessing/spacy_helpers.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
import spacy
1212
from spacy.language import Language
1313
from spacy.tokens import Doc, Token
14+
from thinc.api import set_gpu_allocator, prefer_gpu
15+
1416
from Stemmer import Stemmer
1517
from unidecode import unidecode
1618

@@ -250,7 +252,7 @@ def __normalize_token(self, orig_token: Token) -> str:
250252
return token
251253

252254

253-
def load_language_model(language, normalize_options: dict[str, Any], text_object_type: str) -> Language:
255+
def load_language_model(language, normalize_options: dict[str, Any]) -> Language:
254256
"""Load language model based on name"""
255257
nlp = None
256258
language = language.lower()
@@ -275,7 +277,8 @@ def load_language_model(language, normalize_options: dict[str, Any], text_object
275277
if not normalize_options["ents_to_keep"]:
276278
diabled_pipelines.append("ner")
277279
model_loaded = ""
278-
# spacy.prefer_gpu()
280+
set_gpu_allocator("pytorch")
281+
prefer_gpu()
279282
for model in possible_models:
280283
try:
281284
nlp = spacy.load(model, exclude=diabled_pipelines)
@@ -292,8 +295,6 @@ def load_language_model(language, normalize_options: dict[str, Any], text_object
292295
if normalize_options["ents_to_keep"] and "ner" not in nlp.pipe_names:
293296
print(f"There is no NER pipeline for model {model_loaded}. Exiting...")
294297
exit(-1)
295-
# if text_object_type == "doc":
296-
# nlp.batch_size = 16
297298
return nlp
298299
nlp = spacy.blank("en")
299300
nlp.add_pipe("postprocessor", config=normalize_options, last=True)

0 commit comments

Comments (0)