Basic NLTK code examples: tokenizing text and WordNet

1. Tokenizing text into sentences

import nltk.data
from nltk.tokenize import (sent_tokenize, word_tokenize, RegexpTokenizer,
                           PunktSentenceTokenizer)
from nltk.corpus import stopwords, webtext, wordnet
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


# nltk.download('webtext')
# Tokenize a paragraph into sentences
para = "Hello World. It's good to see you. Thanks for buying this book."
sentences = sent_tokenize(para)
print(sentences)  # ['Hello World.', "It's good to see you.", 'Thanks for buying this book.']

# Loading the pretrained English Punkt model directly gives the same result:
# tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
# print(tokenizer.tokenize(para))
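
# Punkt models for other languages ship with nltk_data as well. A quick
# sketch, assuming the Spanish Punkt model has been downloaded via
# nltk.download('punkt'):
# spanish_tokenizer = nltk.data.load('tokenizers/punkt/PY3/spanish.pickle')
# print(spanish_tokenizer.tokenize('Hola amigo. Estoy bien.'))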
# Tokenize into words
print(word_tokenize('Hello World.'))  # ['Hello', 'World', '.']
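
# Note that word_tokenize follows Treebank conventions and splits
# contractions into two tokens, unlike the regex tokenizer below:
print(word_tokenize("Can't is a contraction."))  # typically ['Ca', "n't", 'is', 'a', 'contraction', '.']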

# Tokenize while keeping contractions as single tokens
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize("Can't is a contraction."))  # ["Can't", 'is', 'a', 'contraction']

# A simple whitespace tokenizer (gaps=True matches separators instead of tokens)
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print(tokenizer.tokenize("Can't is a contraction."))  # ["Can't", 'is', 'a', 'contraction.']

# Train a custom Punkt sentence tokenizer on the raw webtext corpus
text = webtext.raw('overheard.txt')
sent_tokenizer = PunktSentenceTokenizer(text)

sents1 = sent_tokenizer.tokenize(text)
print(sents1[0])
# For comparison, the default sent_tokenize handles this corpus less cleanly:
# sents2 = sent_tokenize(text)
# print(sents2[0])

# The same tokenizer can be trained from the raw file on disk (path varies by install):
# with open('C:\\usr\\share\\nltk_data\\corpora\\webtext\\overheard.txt', encoding='ISO-8859-2') as f:
#    text = f.read()
#    sent_tokenizer = PunktSentenceTokenizer(text)
#    sents = sent_tokenizer.tokenize(text)
#    print(sents[0])
#    print(sents[678])

# WordNet is a lexical database of English.
# Looking up a word's Synsets in WordNet:
# find the first Synset for 'cookbook'.
syn = wordnet.synsets('cookbook')[0]
print(syn.name())
print(syn.definition())

print(wordnet.synset('cookbook.n.01'))
print(wordnet.synsets('cooking')[0].examples())
# Working with hypernyms
print(syn.hypernyms())
print(syn.hypernyms()[0].hyponyms())
print(syn.root_hypernyms())
print(syn.hypernym_paths())
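
# A quick illustration (the exact chain depends on your WordNet version):
# walk the first hypernym path from the root of the taxonomy down to 'cookbook'.
print([s.name() for s in syn.hypernym_paths()[0]])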
# Part of speech
print(syn.pos())
# Look up lemmas in WordNet to find a word's synonyms
lemmas = syn.lemmas()
print(len(lemmas))
print(lemmas[0].name())
print(lemmas[1].name())
print(lemmas[0].synset() == lemmas[1].synset())

print([lemma.name() for lemma in syn.lemmas()])
# Collect all possible synonyms of 'book' across its synsets
synonyms = []
for s in wordnet.synsets('book'):
    for lemma in s.lemmas():
        synonyms.append(lemma.name())
print(len(synonyms))
print(synonyms)
print(len(set(synonyms)))
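
# synsets() mixes noun and verb senses of 'book'; a quick sketch restricting
# the lookup by part of speech:
noun_senses = wordnet.synsets('book', pos=wordnet.NOUN)
verb_senses = wordnet.synsets('book', pos=wordnet.VERB)
print(len(noun_senses), len(verb_senses))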

# Antonyms
gn2 = wordnet.synset('good.n.02')
print(gn2.definition())
evil = gn2.lemmas()[0].antonyms()[0]
print(evil.name())
print(evil.synset().definition())
# The adjective sense has a different antonym
ga1 = wordnet.synset('good.a.01')
print(ga1.definition())
bad = ga1.lemmas()[0].antonyms()[0]
print(bad.name())
print(bad.synset().definition())
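
# A small sketch (hypothetical helper, not part of NLTK) that gathers every
# antonym of a word across all of its senses:
def all_antonyms(word):
    antonyms = set()
    for s in wordnet.synsets(word):
        for lemma in s.lemmas():
            for ant in lemma.antonyms():
                antonyms.add(ant.name())
    return antonyms

print(all_antonyms('good'))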

# Computing Synset similarity in WordNet (Wu-Palmer similarity)
cb = wordnet.synset('cookbook.n.01')
ib = wordnet.synset('instruction_book.n.01')
print(cb.wup_similarity(ib))

# wup_similarity is derived from path distances to a shared hypernym
ref = cb.hypernyms()[0]
print(cb.shortest_path_distance(ref))
print(ib.shortest_path_distance(ref))
print(cb.shortest_path_distance(ib))
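
# A quick sanity check, assuming both synsets sit in the same noun hierarchy:
# path_similarity should equal 1 / (shortest_path_distance + 1).
print(cb.path_similarity(ib), 1 / (cb.shortest_path_distance(ib) + 1))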
# Two dissimilar words score much lower

dog = wordnet.synsets('dog')[0]
print(dog.wup_similarity(cb))
# Comparing verbs
cook = wordnet.synset('cook.v.01')
bake = wordnet.synset('bake.v.02')
print(cook.wup_similarity(bake))
# Path similarity and Leacock-Chodorow (LCH) similarity

print(cb.path_similarity(ib))
print(cb.path_similarity(dog))
print(cb.lch_similarity(ib))
print(cb.lch_similarity(dog))
# Discovering word collocations
words = [w.lower() for w in webtext.words('grail.txt')]
bcf = BigramCollocationFinder.from_words(words)
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4))

# Filter out stopwords and words shorter than three characters
stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4))
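
# The same pattern extends to trigrams. A quick sketch, assuming the
# 'singles.txt' file from the webtext corpus has been downloaded:
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

words = [w.lower() for w in webtext.words('singles.txt')]
tcf = TrigramCollocationFinder.from_words(words)
tcf.apply_word_filter(filter_stops)
tcf.apply_freq_filter(3)  # keep only trigrams that occur at least 3 times
print(tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4))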

 
