# -*- coding: utf-8 -*-
from pandas import read_csv
import numpy as np
from sklearn.datasets.base import Bunch
import pickle  # pickle replaces cPickle here; used for persistence
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
import operator  # used for sorting
from sklearn import metrics
from sklearn.externals import joblib
import xlwt


def importSmallContentdata(file, data, art, label, f, Slast, Snew):
    dataset = read_csv(file)
    Sdata = dataset.values[:, :]
    print(type(Sdata))
    if f == 1:
        for line in Sdata:
            ls = []
            ls.append(line[14])
            ls.append(line[15])
            ls.append(line[16])
            ls.append(line[17])
            Slast.append(ls)
            # print(len(Slast))
        # print("Reference small-category data is ready")
    '''Split the data: rows whose small-category column (smalli) is non-zero go into the training set'''
    for smalli in range(14, 18):
        # print(smalli)
        count = 0
        for line in Sdata:
            count = count + 1
            if line[smalli] != '0' and line[smalli] != 0:
                k = 1
                ls = []
                for i in line:
                    if k == 1:
                        # art.append(i)
                        k = k + 1
                        continue
                    if k == 11:  # k == 14 would not mean line[14], because line is indexed from 0
                        break
                    ls.append(float(i))
                    k = k + 1
                data.append(ls)
                label.append(line[smalli])
                if f == 1:
                    Snew.append(count)
    for line in Sdata:
        art.append(line[0])
    # print("Why do they all exceed the limit?", len(Snew))
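
# Usage sketch, not from the original source: the call below assumes a hypothetical CSV
# whose layout matches the indexing above (article text in column 0, numeric features in
# columns 1-9, the four small-category label columns at indices 14-17).
# data, art, label, Slast, Snew = [], [], [], [], []
# importSmallContentdata("small_data.csv", data, art, label, 1, Slast, Snew)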


def getKvector(train_set, vec, n):
    class obj:
        def __init__(self):
            self.key = 0
            self.weight = 0.0

    nonzero = train_set.tdm.nonzero()
    k = 0
    lis = []
    gather = []
    p = -1
    for i in nonzero[0]:
        p = p + 1
        if k == i:
            a = obj()
            a.key = nonzero[1][p]
            a.weight = train_set.tdm[i, nonzero[1][p]]
            lis.append(a)
        else:
            lis.sort(key=lambda obj: obj.weight, reverse=True)  # sort the objects in the list by weight
            gather.append(lis)
            while k < i:  # advance the row counter to the current sample index
                k = k + 1
            lis = []
            a = obj()
            a.key = nonzero[1][p]
            a.weight = train_set.tdm[i, nonzero[1][p]]
            lis.append(a)
    gather.append(lis)
    # gather holds one feature list per sample (its fact description), already sorted by weight;
    # each entry still carries both key and weight, but only the key is needed from here on
    sj = 1
    for i in gather:
        ls = []
        for j in i:
            sj = sj + 1
            ls.append(float(j.key))
        while sj <= n:
            sj = sj + 1
            ls.append(-1)
        sj = 1
        vec.append(ls)
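
# Usage sketch, not from the original source: getKvector assumes `train_set` is a Bunch
# whose `tdm` field is the sparse TF-IDF matrix produced by vector_space() below; `n`
# caps the length of each output vector, and short rows are padded with -1.
# vec = []
# getKvector(_readbunchobj("train_space.dat"), vec, 10)  # "train_space.dat" is a placeholder path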


'''Read the stop-word file'''
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content


'''Read a bunch object'''
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch


'''Write a bunch object'''
def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)
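
# Round-trip sketch with a placeholder path: the two helpers above simply pickle and
# unpickle whole Bunch objects.
# _writebunchobj("demo_bunch.dat", Bunch(label=[], contents=[]))
# demo = _readbunchobj("demo_bunch.dat")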


def buildtrainbunch(bunch_path, art_train, trainlabel):
    bunch = Bunch(label=[], contents=[])
    for item1 in trainlabel:
        bunch.label.append(item1)
    # trainContentdatasave = []  # stores the segmented words of all training and test data
    for item2 in art_train:
        item2 = str(item2)
        item2 = item2.replace("\r\n", "")
        item2 = item2.replace(" ", "")
        content_seg = jieba.cut(item2)
        save2 = ''
        for item3 in content_seg:
            if len(item3) > 1 and item3 != '\r\n':
                # trainContentdatasave.append(item3)
                save2 = save2 + "," + item3
        bunch.contents.append(save2)
    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the training text bunch!")


def buildtestbunch(bunch_path, art_test, testlabel):
    bunch = Bunch(label=[], contents=[])
    for item1 in testlabel:
        bunch.label.append(item1)
    # testContentdatasave = []  # stores the segmented words of all training and test data
    for item2 in art_test:
        item2 = str(item2)
        item2 = item2.replace("\r\n", "")
        item2 = item2.replace(" ", "")
        content_seg = jieba.cut(item2)
        save2 = ''
        for item3 in content_seg:
            if len(item3) > 1 and item3 != '\r\n':
                # testContentdatasave.append(item3)
                save2 = save2 + "," + item3
        bunch.contents.append(save2)
    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the test text bunch!")
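
# Usage sketch, not from the original source: both build*bunch functions store, per text,
# a comma-joined string of the multi-character tokens jieba produced (e.g. ",词A,词B,词C").
# The path and the toy text/label below are placeholders.
# buildtestbunch("test_bunch.dat", ["这是一条测试文本"], ["1"])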


def vector_space(stopword_path, bunch_path, space_path):
    stpwrdlst = _readfile(stopword_path).splitlines()  # read the stop-word list
    bunch = _readbunchobj(bunch_path)  # load the segmented-text bunch object
    # Build the TF-IDF vector space object. The weight matrix tdm is two-dimensional:
    # tdm[i][j] is the TF-IDF value of word j (its index in the vocabulary) in document i.
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})
    # Initialise the vector space model with TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, min_df=0.0001, use_idf=True,
                                 max_features=15000)
    # print(vectorizer)
    # Convert the texts to a term-weight matrix and keep the vocabulary separately
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    # Persist the bag-of-words space
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")


def testvector_space(stopword_path, bunch_path, space_path, train_tfidf_path):
    stpwrdlst = _readfile(stopword_path).splitlines()  # turn the stop words into a list
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})
    # Load the training set's TF-IDF vector space so the test set reuses its vocabulary ★★
    trainbunch = _readbunchobj(train_tfidf_path)
    tfidfspace.vocabulary = trainbunch.vocabulary
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.7, vocabulary=trainbunch.vocabulary,
                                 min_df=0.001)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")
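

# Hedged end-to-end driver sketch: the original section does not show how these functions
# are wired together, so the call order and every file path below ("small_data.csv",
# "stopwords.txt", "*.dat") are assumptions rather than the author's pipeline.
if __name__ == "__main__":
    data, art, label, Slast, Snew = [], [], [], [], []
    importSmallContentdata("small_data.csv", data, art, label, 1, Slast, Snew)

    # Train side: segment the texts, then fit the TF-IDF space.
    buildtrainbunch("train_bunch.dat", art, label)
    vector_space("stopwords.txt", "train_bunch.dat", "train_space.dat")

    # Test side: a real run would pass held-out texts/labels here; the training vocabulary
    # is reused so train and test vectors share the same feature indices.
    buildtestbunch("test_bunch.dat", art, label)
    testvector_space("stopwords.txt", "test_bunch.dat", "test_space.dat", "train_space.dat")

    # Keep only the top-n TF-IDF feature indices per sample (padded with -1).
    vec = []
    getKvector(_readbunchobj("train_space.dat"), vec, 10)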