Python word-frequency statistics for Walden: text mining that outputs weights, word frequencies and other information, and draws a 3D weight chart

This post shows how to do text mining in Python: the text of Walden is segmented with jieba, turned into TF-IDF weights with TfidfVectorizer, word frequencies are computed, and the resulting weights are then visualized in a 3D chart.
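Before the full script, here is a minimal sketch of the core pipeline the post builds on (jieba segmentation feeding TfidfVectorizer). The two sample sentences are illustrative only.

# Minimal sketch: segment Chinese text with jieba, then compute TF-IDF weights.
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["我到林中去,因为我希望谨慎地生活", "只面对生活的基本事实"]
segmented = [" ".join(jieba.cut(d)) for d in docs]   # space-joined tokens
vectorizer = TfidfVectorizer()
tdm = vectorizer.fit_transform(segmented)            # sparse TF-IDF matrix
print(vectorizer.vocabulary_)                        # term -> column index
print(tdm.toarray())                                 # per-document weights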

# -*- coding: utf-8 -*-

from pandas import read_csv
import numpy as np
from sklearn.datasets.base import Bunch
import pickle  # used to persist (pickle) objects
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
import operator  # used for sorting
from sklearn import metrics
from sklearn.externals import joblib
import xlwt


def importSmallContentdata(file, data, art, label, f, Slast, Snew):
    dataset = read_csv(file)
    Sdata = dataset.values[:, :]
    print(type(Sdata))

    if f == 1:
        for line in Sdata:
            ls = []
            ls.append(line[14])
            ls.append(line[15])
            ls.append(line[16])
            ls.append(line[17])
            Slast.append(ls)
        #print(len(Slast))
        #print("Small-category reference data is ready")

    '''Split the data: rows whose small-category column (smalli) is non-zero go into the training data.'''
    for smalli in range(14, 18):
        #print(smalli)
        count = 0
        for line in Sdata:
            count = count + 1
            if line[smalli] != '0' and line[smalli] != 0:
                k = 1
                ls = []
                for i in line:
                    if k == 1:
                        #art.append(i)
                        k = k + 1
                        continue
                    if k == 11:  # k does not map directly to line[k], because line is 0-indexed
                        break
                    ls.append(float(i))
                    k = k + 1
                data.append(ls)
                label.append(line[smalli])
                if f == 1:
                    Snew.append(count)

    for line in Sdata:
        art.append(line[0])
    #print("Why out of range?", len(Snew))
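For orientation, this is how the loader might be called. The file name "train.csv" and the column layout (article text in column 0, numeric features in columns 1-10, small-category labels in columns 14-17) are assumptions read off the indices used above, not something the post states.

# Hypothetical call; the file name and column layout are assumptions.
data, art, label, Slast, Snew = [], [], [], [], []
importSmallContentdata("train.csv", data, art, label, 1, Slast, Snew)
print(len(data), "feature rows,", len(label), "labels,", len(art), "articles")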

def getKvector(train_set, vec, n):
    class obj:
        def __init__(self):
            self.key = 0
            self.weight = 0.0

    nonzero = train_set.tdm.nonzero()
    k = 0
    lis = []
    gather = []
    p = -1
    for i in nonzero[0]:
        p = p + 1
        if k == i:
            a = obj()
            a.key = nonzero[1][p]
            a.weight = train_set.tdm[i, nonzero[1][p]]
            lis.append(a)
        else:
            lis.sort(key=lambda obj: obj.weight, reverse=True)  # sort the objects in the list by weight
            gather.append(lis)
            while k < i:
                k = k + 1
            lis = []
            a = obj()
            a.key = nonzero[1][p]
            a.weight = train_set.tdm[i, nonzero[1][p]]
            lis.append(a)
    gather.append(lis)

    # gather holds, for each document, the feature terms of its text sorted by weight (largest first);
    # every entry carries both key and weight, but from here on only the key is needed.
    sj = 1
    for i in gather:
        ls = []
        for j in i:
            sj = sj + 1
            ls.append(float(j.key))
        while sj <= n:
            sj = sj + 1
            ls.append(-1)  # pad with -1 up to length n
        sj = 1
        vec.append(ls)
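A small, self-contained illustration of what getKvector produces may help here. The toy matrix below stands in for the real TF-IDF matrix built later in the post; its values are purely hypothetical.

from scipy.sparse import csr_matrix
from sklearn.datasets.base import Bunch

# Toy stand-in for train_set: two documents, three vocabulary terms.
toy = Bunch(tdm=csr_matrix([[0.0, 0.9, 0.3],
                            [0.5, 0.0, 0.0]]))
vec = []
getKvector(toy, vec, 3)
print(vec)  # expected [[1.0, 2.0, -1], [0.0, -1, -1]]: term indices by weight, padded with -1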

'''Read the stop-word file.'''
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content


'''Read a Bunch object from disk.'''
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch


'''Write a Bunch object to disk.'''
def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)


def buildtrainbunch(bunch_path, art_train, trainlabel):
    bunch = Bunch(label=[], contents=[])
    for item1 in trainlabel:
        bunch.label.append(item1)

    #trainContentdatasave = []  # holds the segmented tokens of all training and test data
    for item2 in art_train:
        item2 = str(item2)
        item2 = item2.replace("\r\n", "")
        item2 = item2.replace(" ", "")
        content_seg = jieba.cut(item2)
        save2 = ''
        for item3 in content_seg:
            if len(item3) > 1 and item3 != '\r\n':
                #trainContentdatasave.append(item3)
                save2 = save2 + "," + item3
        bunch.contents.append(save2)

    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the training-text Bunch!!!")


def buildtestbunch(bunch_path, art_test, testlabel):
    bunch = Bunch(label=[], contents=[])
    for item1 in testlabel:
        bunch.label.append(item1)

    #testContentdatasave = []  # holds the segmented tokens of all training and test data
    for item2 in art_test:
        item2 = str(item2)
        item2 = item2.replace("\r\n", "")
        item2 = item2.replace(" ", "")
        content_seg = jieba.cut(item2)
        save2 = ''
        for item3 in content_seg:
            if len(item3) > 1 and item3 != '\r\n':
                #testContentdatasave.append(item3)
                save2 = save2 + "," + item3
        bunch.contents.append(save2)

    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the test-text Bunch!!!")


def vector_space(stopword_path, bunch_path, space_path):
    stpwrdlst = _readfile(stopword_path).splitlines()  # read the stop words
    bunch = _readbunchobj(bunch_path)  # load the Bunch object holding the segmented texts

    # Build the TF-IDF vector-space object. tdm is a 2-D weight matrix:
    # tdm[i][j] is the TF-IDF value of term j (its index in the vocabulary) in document i.
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})

    # Initialise the vector-space model with TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, min_df=0.0001, use_idf=True,
                                 max_features=15000)
    #print(vectorizer)

    # Convert the texts into the weight matrix and keep the vocabulary separately
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_

    # Persist the bag-of-words object
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector-space instance created successfully!!!")


def testvector_space(stopword_path, bunch_path, space_path, train_tfidf_path):
    stpwrdlst = _readfile(stopword_path).splitlines()  # turn the stop words into a list
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})

    # Load the TF-IDF vector space of the training set and reuse its vocabulary
    trainbunch = _readbunchobj(train_tfidf_path)
    tfidfspace.vocabulary = trainbunch.vocabulary

    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.7, vocabulary=trainbunch.vocabulary,
                                 min_df=0.001)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector-space instance created successfully!!!")
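The title and intro promise a 3D weight chart, but the plotting step itself is not included in the code above. Below is a minimal sketch of how such a chart could be drawn from the tfidfspace Bunch produced by vector_space; the function name plot_weight_3d and the axis layout (term index, document index, TF-IDF weight) are assumptions rather than the author's original plotting code.

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection on older matplotlib

def plot_weight_3d(tfidfspace, top_docs=20, top_terms=20):
    # Densify only a small corner of the sparse TF-IDF matrix so the bars stay readable.
    tdm = tfidfspace.tdm[:top_docs, :top_terms].toarray()
    xs, ys = np.meshgrid(np.arange(tdm.shape[1]), np.arange(tdm.shape[0]))
    xs, ys, zs = xs.ravel(), ys.ravel(), tdm.ravel()

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.bar3d(xs, ys, np.zeros_like(zs), 0.8, 0.8, zs)  # one bar per (document, term) weight
    ax.set_xlabel('term index')
    ax.set_ylabel('document index')
    ax.set_zlabel('TF-IDF weight')
    plt.show()

# Example (after vector_space has been run): plot_weight_3d(_readbunchobj(space_path))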
