# Attributes
text (str): A string representation of the document text.
text_with_ws (str): An alias of Doc.text, provided for duck-type compatibility with Span and Token.
mem (cymem.Pool): The document's local memory heap, for all C data it owns.
vocab (Vocab): The store of lexical types.
tensor (numpy.ndarray): Container for dense vector representations.
user_data (Dict[str, Any]): A generic storage area for custom user data.
lang (int): Language of the document's vocabulary.
lang_ (str): Language of the document's vocabulary.
sentiment (float): The document's positivity/negativity score, if available.
user_hooks (Dict[str, Callable]): A dictionary that allows customization of the Doc's properties.
user_token_hooks (Dict[str, Callable]): A dictionary that allows customization of properties of Token children.
user_span_hooks (Dict[str, Callable]): A dictionary that allows customization of properties of Span children.
has_unknown_spaces (bool): Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization).
_ (Underscore): User space for adding custom attribute extensions.
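A minimal sketch illustrating a few of these attributes; the lang values assume an English pipeline loaded as nlp.

doc = nlp("Give it back! He pleaded.")
assert doc.text == doc.text_with_ws           # text_with_ws is an alias of Doc.text
assert doc.lang_ == "en"                      # assumes an English pipeline
assert doc.lang == doc.vocab.strings["en"]    # integer ID of the same language code
doc.user_data["source"] = "example"           # free-form storage for custom data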
# Two ways to construct a Doc
# Construction 1
doc = nlp("Some text")

# Construction 2
from spacy.tokens import Doc
words = ["hello", "world", "!"]
spaces = [True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
# Indexing into the Doc to get a Token or a Span
doc = nlp("Give it back! He pleaded.")
assert doc[0].text == "Give"
assert doc[-1].text == "."
span = doc[1:3]
assert span.text == "it back"
# Iterating over the tokens
doc = nlp("Give it back")
assert [t.text for t in doc] == ["Give", "it", "back"]
# Length (number of tokens)
doc = nlp("Give it back! He pleaded.")
assert len(doc) == 7
# Registering a custom extension attribute or method
from spacy.tokens import Doc
city_getter = lambda doc: any(city in doc.text for city in ("New York", "Paris", "Berlin"))
Doc.set_extension("has_city", getter=city_getter)
doc = nlp("I like New York")
assert doc._.has_city
# Getting a Span from character offsets
doc = nlp("I like New York")
span = doc.char_span(7, 15, label="GPE")
assert span.text == "New York"
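One small extra check, not part of the original snippet: with the default strict alignment, char_span returns None when the offsets do not line up with token boundaries.

doc = nlp("I like New York")
assert doc.char_span(8, 15) is None   # "ew York" does not start on a token boundary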
# Setting the entities: labelling a few tokens as a named entity
from spacy.tokens import Span
doc = nlp("Mr. Best flew to New York on Saturday morning.")
doc.set_ents([Span(doc, 0, 2, "PERSON")])
ents = list(doc.ents)
assert ents[0].label_ == "PERSON"
assert ents[0].text == "Mr. Best"
# Comparing the similarity of two documents
apples = nlp("I like apples")
oranges = nlp("I like oranges")
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
# Checking whether the Doc carries a given annotation (here: dependency parse)
doc = nlp("This is a text")
assert doc.has_annotation("DEP")
# Exporting token attributes to a numpy array
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
text = "Give it back! He pleaded."
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
np_array = doc.to_array("POS")
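For reference, a quick sketch of the shapes involved: passing a list of attributes gives a 2-D array with one row per token, while a single attribute gives a 1-D array.

from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
doc = nlp("Give it back! He pleaded.")
assert doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]).shape == (len(doc), 4)   # 2-D: tokens x attributes
assert doc.to_array("POS").shape == (len(doc),)                                # 1-D: one value per token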
# Loading attributes from a numpy array into a new Doc
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
doc = nlp("Hello world!")
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
doc2 = Doc(doc.vocab, words=[t.text for t in doc])
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
assert doc[0].pos_ == doc2[0].pos_
# Merging several Docs into a single Doc
from spacy.tokens import Doc
texts = ["London is the capital of the United Kingdom.",
         "The River Thames flows through London.",
         "The famous Tower Bridge crosses the River Thames."]
docs = list(nlp.pipe(texts))
c_doc = Doc.from_docs(docs)
assert str(c_doc) == " ".join(texts)
assert len(list(c_doc.sents)) == len(docs)
assert [str(ent) for ent in c_doc.ents] == [str(ent) for doc in docs for ent in doc.ents]
# Serializing the Doc to a byte string
doc = nlp("Give it back! He pleaded.")
doc_bytes = doc.to_bytes()
# Restoring a Doc from a byte string
from spacy.tokens import Doc
doc = nlp("Give it back! He pleaded.")
doc_bytes = doc.to_bytes()
doc2 = Doc(doc.vocab).from_bytes(doc_bytes)
assert doc.text == doc2.text
# Converting the Doc to JSON
doc = nlp("All we have to decide is what to do with the time that is given us.")
assert doc.to_json()["text"] == doc.text
# Reconstructing a Doc from JSON
from spacy.tokens import Doc
doc = nlp("All we have to decide is what to do with the time that is given us.")
doc_json = doc.to_json()
deserialized_doc = Doc(nlp.vocab).from_json(doc_json)
assert deserialized_doc.text == doc.text == doc_json["text"]
# Retokenizing: merging several tokens into one, or splitting one token into several
# (all edits are applied when the context manager exits)
doc = nlp("Hello world!")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2])

doc = nlp("I like David Bowie")
with doc.retokenize() as retokenizer:
    attrs = {"LEMMA": "David Bowie"}
    retokenizer.merge(doc[2:4], attrs=attrs)

doc = nlp("I live in NewYork")
with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]
    attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
    retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
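To make the merge easier to follow, here is a minimal check (assuming the default English tokenizer) of how the first example changes the token count:

doc = nlp("Hello world!")
assert len(doc) == 3                 # "Hello", "world", "!"
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2])
assert len(doc) == 2                 # "Hello world", "!"
assert doc[0].text == "Hello world"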
# Getting the named entities
doc = nlp("Mr. Best flew to New York on Saturday morning.")
ents = list(doc.ents)
assert ents[0].label_ == "PERSON"
assert ents[0].text == "Mr. Best"
# Storing arbitrary span groups on the Doc (here, tokenization errors)
doc = nlp("Their goi ng home")
doc.spans["errors"] = [doc[0:1], doc[1:3]]
# Text classification scores for the whole document, keyed by category label
doc = nlp("This is a text about football.")
print(doc.cats)
# Getting the list of noun chunks
doc = nlp("A phrase with another phrase occurs.")
chunks = list(doc.noun_chunks)
assert len(chunks) == 2
assert chunks[0].text == "A phrase"
assert chunks[1].text == "another phrase"
# Getting the list of sentences
doc = nlp("This is a sentence. Here's another...")
sents = list(doc.sents)
assert len(sents) == 2
assert [s.root.text for s in sents] == ["is", "'s"]
# The document's vector (by default, the average of its token vectors)
doc = nlp("I like apples")
assert doc.vector.dtype == "float32"
assert doc.vector.shape == (300,)
# Checking whether a vector is available for the document
doc = nlp("I like apples")
assert doc.has_vector
# The L2 norm of the document's vector
doc1 = nlp("I like apples")
doc2 = nlp("I like oranges")
doc1.vector_norm  # 4.54232424414368
doc2.vector_norm  # 3.304373298575751
assert doc1.vector_norm != doc2.vector_norm
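As a sanity check (a small sketch, assuming numpy is installed and the pipeline has word vectors), vector_norm should match the L2 norm computed directly from doc.vector:

import numpy
doc = nlp("I like apples")
assert abs(doc.vector_norm - numpy.linalg.norm(doc.vector)) < 1e-3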