Attributes and methods of spaCy's Doc object

# Attributes
text (str) - A string representation of the document text.
text_with_ws (str) - An alias of Doc.text, provided for duck-type compatibility with Span and Token.
mem (cymem.Pool) - The document's local memory heap, for all C data it owns.
vocab (Vocab) - The store of lexical types.
tensor (numpy.ndarray) - Container for dense vector representations.
user_data (Dict[str, Any]) - A generic storage area, for user custom data.
lang (int) - Language of the document's vocabulary, as a hash ID.
lang_ (str) - Language of the document's vocabulary, as a string.
sentiment (float) - The document's positivity/negativity score, if available.
user_hooks (Dict[str, Callable]) - A dictionary that allows customization of the Doc's properties.
user_token_hooks (Dict[str, Callable]) - A dictionary that allows customization of properties of Token children.
user_span_hooks (Dict[str, Callable]) - A dictionary that allows customization of properties of Span children.
has_unknown_spaces (bool) - Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization).
_ (Underscore) - User space for adding custom attribute extensions.
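# A quick sketch of reading a few of these attributes. This assumes a pipeline
# with word vectors (e.g. en_core_web_md) is loaded as nlp; the rest of the
# snippets below reuse this nlp object.
import spacy

nlp = spacy.load("en_core_web_md")  # assumed pipeline; any English model with vectors works
doc = nlp("Give it back! He pleaded.")

assert doc.text == "Give it back! He pleaded."
assert doc.lang_ == "en"                 # language of the vocabulary as a string
assert isinstance(doc.user_data, dict)   # free-form storage for custom data
doc.user_data["source"] = "example"      # hypothetical key, just for illustration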
# Two ways to construct a Doc
# Construction 1
doc = nlp("Some text")

# Construction 2
from spacy.tokens import Doc

words = ["hello", "world", "!"]
spaces = [True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
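# A minimal check of how the spaces flags control the reconstructed text,
# using the Doc built above: True means the word is followed by a space.
assert doc.text == "hello world!"
assert [t.whitespace_ for t in doc] == [" ", "", ""]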
# Indexing: getting a Token or a Span
doc = nlp("Give it back! He pleaded.")
assert doc[0].text == "Give"
assert doc[-1].text == "."
span = doc[1:3]
assert span.text == "it back"
# Iterating over tokens
doc = nlp("Give it back")
assert [t.text for t in doc] == ["Give", "it", "back"]
# Length (number of tokens)
doc = nlp("Give it back! He pleaded.")
assert len(doc) == 7
# Setting a custom extension attribute or method
from spacy.tokens import Doc
city_getter = lambda doc: any(city in doc.text for city in ("New York", "Paris", "Berlin"))
Doc.set_extension("has_city", getter=city_getter)
doc = nlp("I like New York")
assert doc._.has_city
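# Besides getters, set_extension also accepts a default value or a method.
# A hypothetical method extension (the name "to_upper" is made up for illustration):
Doc.set_extension("to_upper", method=lambda doc: doc.text.upper())
assert doc._.to_upper() == "I LIKE NEW YORK"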
# Retrieving an extension's definition (a tuple of default, method, getter, setter)
from spacy.tokens import Doc
Doc.set_extension("has_city", default=False)
extension = Doc.get_extension("has_city")
assert extension == (False, None, None, None)
# Checking whether an extension exists
from spacy.tokens import Doc
Doc.set_extension("has_city", default=False)
assert Doc.has_extension("has_city")
# Removing an extension
from spacy.tokens import Doc
Doc.set_extension("has_city", default=False)
removed = Doc.remove_extension("has_city")
assert not Doc.has_extension("has_city")
# Creating a Span from character offsets
doc = nlp("I like New York")
span = doc.char_span(7, 15, label="GPE")
assert span.text == "New York"
# Setting entities: labelling a span of tokens as a named entity
from spacy.tokens import Span
doc = nlp("Mr. Best flew to New York on Saturday morning.")
doc.set_ents([Span(doc, 0, 2, "PERSON")])
ents = list(doc.ents)
assert ents[0].label_ == "PERSON"
assert ents[0].text == "Mr. Best"
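# Entities can also be overwritten wholesale by assigning to doc.ents
# (a small sketch, equivalent to the call above for this simple case):
doc.ents = [Span(doc, 0, 2, label="PERSON")]
assert [(e.text, e.label_) for e in doc.ents] == [("Mr. Best", "PERSON")]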
# Comparing the similarity of two Docs
apples = nlp("I like apples")
oranges = nlp("I like oranges")
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
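# Similarity is computed from averaged word vectors, so this assumes a pipeline
# that ships vectors (e.g. a *_md or *_lg model); without vectors the result is
# of limited use and spaCy emits a warning. A quick sanity check:
assert apples.has_vector and oranges.has_vector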
# Counting token occurrences by attribute
from spacy.attrs import ORTH
doc = nlp("apple apple orange banana")
counts = doc.count_by(ORTH)  # keys are hash IDs of the token texts, values are counts
# e.g. {7024: 1, 119552: 1, 2087: 2} (the exact hash values depend on the vocabulary)
doc.to_array([ORTH])
# e.g. array([[11880], [11880], [7561], [12800]]) (values likewise vocabulary-dependent)
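# The hash keys come from the shared StringStore and can be mapped back to
# readable strings (a minimal sketch using the Doc above):
readable = {doc.vocab.strings[key]: value for key, value in doc.count_by(ORTH).items()}
assert readable == {"apple": 2, "orange": 1, "banana": 1}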
doc = nlp("This is a test")
matrix = doc.get_lca_matrix()
# array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32)
# Checking whether the Doc has a given annotation (e.g. dependency labels)
doc = nlp("This is a text")
assert doc.has_annotation("DEP")
# Exporting token attributes to a numpy array
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
doc = nlp("Give it back! He pleaded.")
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
np_array = doc.to_array("POS")
# Loading attributes from a numpy array back into a Doc
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
doc = nlp("Hello world!")
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
doc2 = Doc(doc.vocab, words=[t.text for t in doc])
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
assert doc[0].pos_ == doc2[0].pos_
# Merging multiple Docs into one
from spacy.tokens import Doc
texts = ["London is the capital of the United Kingdom.",
         "The River Thames flows through London.",
         "The famous Tower Bridge crosses the River Thames."]
docs = list(nlp.pipe(texts))
c_doc = Doc.from_docs(docs)
assert str(c_doc) == " ".join(texts)
assert len(list(c_doc.sents)) == len(docs)
assert [str(ent) for ent in c_doc.ents] == [str(ent) for doc in docs for ent in doc.ents]
# Saving a Doc to disk
doc.to_disk("/path/to/doc")
# Loading a Doc from disk
from spacy.tokens import Doc
from spacy.vocab import Vocab
doc = Doc(Vocab()).from_disk("/path/to/doc")
# Serializing to bytes
doc = nlp("Give it back! He pleaded.")
doc_bytes = doc.to_bytes()
# Deserializing from bytes
from spacy.tokens import Doc
doc = nlp("Give it back! He pleaded.")
doc_bytes = doc.to_bytes()
doc2 = Doc(doc.vocab).from_bytes(doc_bytes)
assert doc.text == doc2.text
# Converting to JSON
doc = nlp("All we have to decide is what to do with the time that is given us.")
assert doc.to_json()["text"] == doc.text
# Loading a Doc from JSON
from spacy.tokens import Doc
doc = nlp("All we have to decide is what to do with the time that is given us.")
doc_json = doc.to_json()
deserialized_doc = Doc(nlp.vocab).from_json(doc_json)
assert deserialized_doc.text == doc.text == doc_json["text"]
# Retokenization: merging or splitting tokens in place. All changes queued inside
# the retokenize() context manager are applied together when the block exits.
doc = nlp("Hello world!")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2])
    
doc = nlp("I like David Bowie")
with doc.retokenize() as retokenizer:
    attrs = {"LEMMA": "David Bowie"}
    retokenizer.merge(doc[2:4], attrs=attrs)
    
doc = nlp("I live in NewYork")
with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]
    attrs = {"POS": ["PROPN", "PROPN"],
             "DEP": ["pobj", "compound"]}
    retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
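# A quick check of the effect, assuming the default English tokenizer keeps
# "NewYork" as a single token before the split:
assert [t.text for t in doc] == ["I", "live", "in", "New", "York"]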
# Accessing entity annotations
doc = nlp("Mr. Best flew to New York on Saturday morning.")
ents = list(doc.ents)
assert ents[0].label_ == "PERSON"
assert ents[0].text == "Mr. Best"
# Storing arbitrary groups of spans under doc.spans
doc = nlp("Their goi ng home")
doc.spans["errors"] = [doc[0:1], doc[1:3]]
# doc.cats: a dictionary mapping text-classification categories to scores (populated by a textcat component)
doc = nlp("This is a text about football.")
print(doc.cats)
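# With a plain pipeline that has no text classifier, doc.cats is simply an empty
# dict; a minimal sketch of how scores look (the label "SPORTS" is hypothetical):
assert doc.cats == {}          # no textcat component in this pipeline
doc.cats["SPORTS"] = 0.95      # scores can also be set manually, e.g. for training data
print(doc.cats)                # {'SPORTS': 0.95}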
# Noun chunks (requires the dependency parse)
doc = nlp("A phrase with another phrase occurs.")
chunks = list(doc.noun_chunks)
assert len(chunks) == 2
assert chunks[0].text == "A phrase"
assert chunks[1].text == "another phrase"
# Sentences
doc = nlp("This is a sentence. Here's another...")
sents = list(doc.sents)
assert len(sents) == 2
assert [s.root.text for s in sents] == ["is", "'s"]
# Document vector (the average of the token vectors)
doc = nlp("I like apples")
assert doc.vector.dtype == "float32"
assert doc.vector.shape == (300,)  # (300,) assumes a pipeline with 300-dimensional vectors, e.g. en_core_web_md
# Whether the Doc has a vector assigned
doc = nlp("I like apples")
assert doc.has_vector
# L2 norm of the document's vector
doc1 = nlp("I like apples")
doc2 = nlp("I like oranges")
doc1.vector_norm  # 4.54232424414368
doc2.vector_norm  # 3.304373298575751
assert doc1.vector_norm != doc2.vector_norm