Python自然语言处理 Chapter 1

最新推荐文章于 2024-07-10 08:41:33 发布

weixin_30567471

最新推荐文章于 2024-07-10 08:41:33 发布

阅读量89

点赞数

文章标签：人工智能 python

原文链接：http://www.cnblogs.com/AryaStark/p/8244751.html

版权

# Walkthrough of basic NLTK text exploration (NLTK book, chapter 1), written
# as a REPL-style transcript: the bare expressions below only display a value
# when typed at an interactive prompt.
# `from __future__ import division` makes `/` perform true division on
# Python 2, so the ratios computed further down are floats.
from __future__ import division
import nltk
# NOTE(review): called with no arguments; presumably this opens NLTK's
# interactive downloader on every run -- confirm this is intended.
nltk.download()
# The star import supplies the names used below (text1..text4, FreqDist).
from nltk.book import *

# Search the text: show every occurrence of the word with its context.
text1.concordance("monstrous")
# Words that appear in contexts similar to the given word.
text1.similar("monstrous")
# Contexts shared by two or more words.
text2.common_contexts(["monstrous","very"])
# NOTE(review): `matplotlib` is never referenced by name in this script;
# presumably imported because the plotting calls below need it -- confirm.
import matplotlib
# Dispersion plot: shows where each word occurs in the text, measured as the
# number of words that precede it counting from the start of the text.
text4.dispersion_plot(["citizens","democracy","freedom","duties","American"])
# Produce a passage in a style similar to the text.
# NOTE(review): availability of generate() presumably depends on the
# installed NLTK version -- verify.
text3.generate()
# Total number of tokens in the text.
len(text3)
# Distinct tokens: set() keeps each element only once.
sorted(set(text3))
len(set(text3))
# Average number of uses per distinct token.
len(text3)/len(set(text3))
# How many times a word occurs, and what percentage of the text it makes up.
text3.count("smote")
100*text4.count("a")/len(text4)

fdist1=FreqDist(text1)  # frequency distribution of the tokens in text1
# NOTE(review): ordering of keys() presumably differs between NLTK/Python
# versions -- do not rely on it being sorted by frequency without checking.
vocabulary=fdist1.keys()  # the samples (distinct tokens) of the distribution
fdist1['whale']  # count recorded for the token 'whale'
fdist1.plot(50,cumulative=True)  # cumulative plot for the 50 most frequent tokens
V = set(text1)  # vocabulary of text1; filtered below to words longer than 15 characters
long_words=[w for w in V if len(w) > 15]

text4.collocations()  # frequently co-occurring word pairs (bigram collocations)
[len(w) for w in text1]  # length of every token in text1
fdist=FreqDist([len(w) for w in text1])  # how often each token length occurs
fdist  # original author's note: only 20 distinct lengths occur -- not verified here
fdist.max()  # the most frequent token length
fdist.freq(3)  # relative frequency of the sample 3, i.e. share of tokens of length 3

转载于:https://www.cnblogs.com/AryaStark/p/8244751.html

weixin_30567471

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python自然语言处理 Chapter 1

from __future__ import divisionimport nltknltk.download()from nltk.book import *#搜索文本text1.concordance("monstrous")#出现在相似上下文中德词汇text1.similar("monstrous")#两个或两个以上的词共同的上下文tex...
复制链接

扫一扫