Attributes and methods of spaCy's Doc object

# Attributes
text (str) - A string representation of the document text.
text_with_ws (str) - An alias of Doc.text, provided for duck-type compatibility with Span and Token.
mem (cymem.Pool) - The document's local memory heap, for all C data it owns.
vocab (Vocab) - The store of lexical types.
tensor (numpy.ndarray) - Container for dense vector representations.
user_data (Dict[str, Any]) - A generic storage area, for user custom data.
lang (int) - Language of the document's vocabulary, as a hash ID.
lang_ (str) - Language of the document's vocabulary, as a string.
sentiment (float) - The document's positivity/negativity score, if available.
user_hooks (Dict[str, Callable]) - A dictionary that allows customization of the Doc's properties.
user_token_hooks (Dict[str, Callable]) - A dictionary that allows customization of properties of Token children.
user_span_hooks (Dict[str, Callable]) - A dictionary that allows customization of properties of Span children.
has_unknown_spaces (bool) - Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization).
_ (Underscore) - User space for adding custom attribute extensions.
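# A quick sketch of reading a few of these attributes. This assumes a pipeline
# with word vectors (e.g. en_core_web_md) is loaded as nlp; the rest of the
# snippets below reuse this nlp object.
import spacy

nlp = spacy.load("en_core_web_md")  # assumed pipeline; any English model with vectors works
doc = nlp("Give it back! He pleaded.")

assert doc.text == "Give it back! He pleaded."
assert doc.lang_ == "en"                 # language of the vocabulary as a string
assert isinstance(doc.user_data, dict)   # free-form storage for custom data
doc.user_data["source"] = "example"      # hypothetical key, just for illustration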
# Two ways to construct a Doc
# Construction 1
doc = nlp("Some text")

# Construction 2
from spacy.tokens import Doc

words = ["hello", "world", "!"]
spaces = [True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
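# A minimal check of how the spaces flags control the reconstructed text,
# using the Doc built above: True means the word is followed by a space.
assert doc.text == "hello world!"
assert [t.whitespace_ for t in doc] == [" ", "", ""]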
# Indexing: getting a Token or a Span
doc = nlp("Give it back! He pleaded.")
assert doc[0].text == "Give"
assert doc[-1].text == "."
span = doc[1:3]
assert span.text == "it back"
# Iterating over tokens
doc = nlp("Give it back")
assert [t.text for t in doc] == ["Give", "it", "back"]
# Length (number of tokens)
doc = nlp("Give it back! He pleaded.")
assert len(doc) == 7
# Setting a custom extension attribute or method
from spacy.tokens import Doc
city_getter = lambda doc: any(city in doc.text for city in ("New York", "Paris", "Berlin"))
Doc.set_extension("has_city", getter=city_getter)
doc = nlp("I like New York")
assert doc._.has_city
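# Besides getters, set_extension also accepts a default value or a method.
# A hypothetical method extension (the name "to_upper" is made up for illustration):
Doc.set_extension("to_upper", method=lambda doc: doc.text.upper())
assert doc._.to_upper() == "I LIKE NEW YORK"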
# Retrieving an extension's definition (a tuple of default, method, getter, setter)
from spacy.tokens import Doc
Doc.set_extension("has_city", default=False)
extension = Doc.get_extension("has_city")
assert extension == (False, None, None, None)
# Checking whether an extension exists
from spacy.tokens import Doc
Doc.set_extension("has_city", default=False)
assert Doc.has_extension("has_city")
# Removing an extension
from spacy.tokens import Doc
Doc.set_extension("has_city", default=False)
removed = Doc.remove_extension("has_city")
assert not Doc.has_extension("has_city")
# Creating a Span from character offsets
doc = nlp("I like New York")
span = doc.char_span(7, 15, label="GPE")
assert span.text == "New York"
# Setting entities: labelling a span of tokens as a named entity
from spacy.tokens import Span
doc = nlp("Mr. Best flew to New York on Saturday morning.")
doc.set_ents([Span(doc, 0, 2, "PERSON")])
ents = list(doc.ents)
assert ents[0].label_ == "PERSON"
assert ents[0].text == "Mr. Best"
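# Entities can also be overwritten wholesale by assigning to doc.ents
# (a small sketch, equivalent to the call above for this simple case):
doc.ents = [Span(doc, 0, 2, label="PERSON")]
assert [(e.text, e.label_) for e in doc.ents] == [("Mr. Best", "PERSON")]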
# Comparing the similarity of two Docs
apples = nlp("I like apples")
oranges = nlp("I like oranges")
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
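# Similarity is computed from averaged word vectors, so this assumes a pipeline
# that ships vectors (e.g. a *_md or *_lg model); without vectors the result is
# of limited use and spaCy emits a warning. A quick sanity check:
assert apples.has_vector and oranges.has_vector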
# Counting token occurrences by attribute
from spacy.attrs import ORTH
doc = nlp("apple apple orange banana")
counts = doc.count_by(ORTH)  # keys are hash IDs of the token texts, values are counts
# e.g. {7024: 1, 119552: 1, 2087: 2} (the exact hash values depend on the vocabulary)
doc.to_array([ORTH])
# e.g. array([[11880], [11880], [7561], [12800]]) (values likewise vocabulary-dependent)
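# The hash keys come from the shared StringStore and can be mapped back to
# readable strings (a minimal sketch using the Doc above):
readable = {doc.vocab.strings[key]: value for key, value in doc.count_by(ORTH).items()}
assert readable == {"apple": 2, "orange": 1, "banana": 1}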
doc = nlp("This is a test")
matrix = doc.get_lca_matrix()
# array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32)
# Checking whether the Doc has a given annotation (e.g. dependency labels)
doc = nlp("This is a text")
assert doc.has_annotation("DEP")
# Exporting token attributes to a numpy array
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
doc = nlp("Give it back! He pleaded.")
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
np_array = doc.to_array("POS")
# Loading attributes from a numpy array back into a Doc
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
doc = nlp("Hello world!")
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
doc2 = Doc(doc.vocab, words=[t.text for t in doc])
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
assert doc[0].pos_ == doc2[0].pos_
# Merging multiple Docs into one
from spacy.tokens import Doc
texts = ["London is the capital of the United Kingdom.",
         "The River Thames flows through London.",
         "The famous Tower Bridge crosses the River Thames."]
docs = list(nlp.pipe(texts))
c_doc = Doc.from_docs(docs)
assert str(c_doc) == " ".join(texts)
assert len(list(c_doc.sents)) == len(docs)
assert [str(ent) for ent in c_doc.ents] == [str(ent) for doc in docs for ent in doc.ents]
# Saving a Doc to disk
doc.to_disk("/path/to/doc")
# Loading a Doc from disk
from spacy.tokens import Doc
from spacy.vocab import Vocab
doc = Doc(Vocab()).from_disk("/path/to/doc")
# Serializing to bytes
doc = nlp("Give it back! He pleaded.")
doc_bytes = doc.to_bytes()
# Deserializing from bytes
from spacy.tokens import Doc
doc = nlp("Give it back! He pleaded.")
doc_bytes = doc.to_bytes()
doc2 = Doc(doc.vocab).from_bytes(doc_bytes)
assert doc.text == doc2.text
# Converting to JSON
doc = nlp("All we have to decide is what to do with the time that is given us.")
assert doc.to_json()["text"] == doc.text
# Loading a Doc from JSON
from spacy.tokens import Doc
doc = nlp("All we have to decide is what to do with the time that is given us.")
doc_json = doc.to_json()
deserialized_doc = Doc(nlp.vocab).from_json(doc_json)
assert deserialized_doc.text == doc.text == doc_json["text"]
# Retokenization: merging or splitting tokens in place. All changes queued inside
# the retokenize() context manager are applied together when the block exits.
doc = nlp("Hello world!")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2])
    
doc = nlp("I like David Bowie")
with doc.retokenize() as retokenizer:
    attrs = {"LEMMA": "David Bowie"}
    retokenizer.merge(doc[2:4], attrs=attrs)
    
doc = nlp("I live in NewYork")
with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]
    attrs = {"POS": ["PROPN", "PROPN"],
             "DEP": ["pobj", "compound"]}
    retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
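# A quick check of the effect, assuming the default English tokenizer keeps
# "NewYork" as a single token before the split:
assert [t.text for t in doc] == ["I", "live", "in", "New", "York"]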
# Accessing entity annotations
doc = nlp("Mr. Best flew to New York on Saturday morning.")
ents = list(doc.ents)
assert ents[0].label_ == "PERSON"
assert ents[0].text == "Mr. Best"
# Storing arbitrary groups of spans under doc.spans
doc = nlp("Their goi ng home")
doc.spans["errors"] = [doc[0:1], doc[1:3]]
# doc.cats: a dictionary mapping text-classification categories to scores (populated by a textcat component)
doc = nlp("This is a text about football.")
print(doc.cats)
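# With a plain pipeline that has no text classifier, doc.cats is simply an empty
# dict; a minimal sketch of how scores look (the label "SPORTS" is hypothetical):
assert doc.cats == {}          # no textcat component in this pipeline
doc.cats["SPORTS"] = 0.95      # scores can also be set manually, e.g. for training data
print(doc.cats)                # {'SPORTS': 0.95}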
# Noun chunks (requires the dependency parse)
doc = nlp("A phrase with another phrase occurs.")
chunks = list(doc.noun_chunks)
assert len(chunks) == 2
assert chunks[0].text == "A phrase"
assert chunks[1].text == "another phrase"
# Sentences
doc = nlp("This is a sentence. Here's another...")
sents = list(doc.sents)
assert len(sents) == 2
assert [s.root.text for s in sents] == ["is", "'s"]
# Document vector (the average of the token vectors)
doc = nlp("I like apples")
assert doc.vector.dtype == "float32"
assert doc.vector.shape == (300,)  # (300,) assumes a pipeline with 300-dimensional vectors, e.g. en_core_web_md
# Whether the Doc has a vector assigned
doc = nlp("I like apples")
assert doc.has_vector
# L2 norm of the document's vector
doc1 = nlp("I like apples")
doc2 = nlp("I like oranges")
doc1.vector_norm  # 4.54232424414368
doc2.vector_norm  # 3.304373298575751
assert doc1.vector_norm != doc2.vector_norm