Advanced NLP with spaCy
Ines Montani
spaCy core developer
Use the `nlp.pipe` method to process many texts: it yields `Doc` objects and avoids calling `nlp` on each text.
BAD:
docs = [nlp(text) for text in LOTS_OF_TEXTS]
GOOD:
docs = list(nlp.pipe(LOTS_OF_TEXTS))
Setting `as_tuples=True` on `nlp.pipe` lets you pass in `(text, context)` tuples.
It yields `(doc, context)` tuples, so metadata can be kept alongside each `doc`.
data = [
    ('This is a text', {'id': 1, 'page_number': 15}),
    ('And another text', {'id': 2, 'page_number': 16}),
]
for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context['page_number'])
This is a text 15
And another text 16
from spacy.tokens import Doc

Doc.set_extension('id', default=None)
Doc.set_extension('page_number', default=None)

data = [
    ('This is a text', {'id': 1, 'page_number': 15}),
    ('And another text', {'id': 2, 'page_number': 16}),
]
for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context['id']
    doc._.page_number = context['page_number']
Use `nlp.make_doc` to turn a text into a `Doc` object
BAD:
doc = nlp("Hello world")
GOOD:
doc = nlp.make_doc("Hello world!")
Use `nlp.disable_pipes` to temporarily disable one or more pipes
# Disable tagger and parser
with nlp.disable_pipes('tagger', 'parser'):
    # Process the text and print the entities
    doc = nlp(text)
    print(doc.ents)
The pipes are only disabled temporarily: they are restored after the `with` block
Advanced NLP with spaCy