python做文本挖掘_python文本挖掘模版

import xlrd

import jieba

import sys

import importlib

import os #python内置的包,用于进行文件目录操作,我们将会用到os.listdir函数

import pickle #导入cPickle包并且取一个别名pickle #持久化类

import random

import numpy as np

import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

from pylab import mpl

from sklearn.naive_bayes import MultinomialNB # 导入多项式贝叶斯算法包

from sklearn import svm

from sklearn import metrics

from sklearn.datasets.base import Bunch

from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): looks like a leftover of the Python 2 idiom
# `reload(sys); sys.setdefaultencoding(...)`; nothing in this file uses the
# reloaded module afterwards - confirm before removing.
importlib.reload(sys)

# Module-level buffers shared by the import*/build* functions below.

trainContentdatasave=[] # every kept jieba token of the training texts (filled by buildtrainbunch)

# every kept jieba token of the test texts (filled by buildtestbunch)
testContentdatasave=[]

# one single-element list per row of the training workbook (filled by importTrainContentdata)
trainContentdata = []

# one single-element list per row of the test workbook (filled by importTestContentdata)
testContentdata = []

# one single-element list per row of the training-label workbook (filled by importTrainlabeldata)
trainlabeldata = []

# one single-element list per row of the test-label workbook (filled by importTestlabeldata)
testlabeldata = []

#导入文本描述的训练和测试数据

def importTrainContentdata(file='20180716_train.xls', sheet_name="Sheet1"):
    """Load the training texts from an Excel sheet into ``trainContentdata``.

    Each row of the sheet contributes one single-element list that holds the
    value of the row's first column; the lists are appended to the
    module-level ``trainContentdata`` in row order.

    Args:
        file: Path of the workbook to read. Defaults to the file name that
            was previously hard-coded in this function.
        sheet_name: Name of the worksheet to read. Defaults to "Sheet1".
    """
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name(sheet_name)
    # Only column 0 is read. The value stays wrapped in a list because the
    # rest of the pipeline expects one list per row.
    trainContentdata.extend([ws.cell(r, 0).value] for r in range(ws.nrows))

def importTestContentdata(file='20180716_test.xls', sheet_name="Sheet1"):
    """Load the test texts from an Excel sheet into ``testContentdata``.

    Each row of the sheet contributes one single-element list that holds the
    value of the row's first column; the lists are appended to the
    module-level ``testContentdata`` in row order.

    Args:
        file: Path of the workbook to read. Defaults to the file name that
            was previously hard-coded in this function.
        sheet_name: Name of the worksheet to read. Defaults to "Sheet1".
    """
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name(sheet_name)
    # Only column 0 is read. The value stays wrapped in a list because the
    # rest of the pipeline expects one list per row.
    testContentdata.extend([ws.cell(r, 0).value] for r in range(ws.nrows))

#导入类别的训练和测试数据

def importTrainlabeldata(file='20180716_train_label.xls', sheet_name="Sheet1"):
    """Load the training labels from an Excel sheet into ``trainlabeldata``.

    Each row of the sheet contributes one single-element list that holds the
    value of the row's first column; the lists are appended to the
    module-level ``trainlabeldata`` in row order.

    Args:
        file: Path of the workbook to read. Defaults to the file name that
            was previously hard-coded in this function.
        sheet_name: Name of the worksheet to read. Defaults to "Sheet1".
    """
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name(sheet_name)
    # Only column 0 is read. The value stays wrapped in a list because the
    # script's evaluation step reads labels as label[i][0].
    trainlabeldata.extend([ws.cell(r, 0).value] for r in range(ws.nrows))

def importTestlabeldata(file='20180716_test_label.xls', sheet_name="Sheet1"):
    """Load the test labels from an Excel sheet into ``testlabeldata``.

    Each row of the sheet contributes one single-element list that holds the
    value of the row's first column; the lists are appended to the
    module-level ``testlabeldata`` in row order.

    Args:
        file: Path of the workbook to read. Defaults to the file name that
            was previously hard-coded in this function.
        sheet_name: Name of the worksheet to read. Defaults to "Sheet1".
    """
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name(sheet_name)
    # Only column 0 is read. The value stays wrapped in a list because the
    # script's evaluation step reads labels as label[i][0].
    testlabeldata.extend([ws.cell(r, 0).value] for r in range(ws.nrows))

"""

def importClassSet():

file = 'ClassSet.xls'

wb = xlrd.open_workbook(file)

ws = wb.sheet_by_name("Sheet1")

for r in range(ws.nrows):

col = []

for c in range(ws.ncols):

col.append(ws.cell(r, c).value)

ClassSet.append(col)

"""

def buildtrainbunch(bunch_path):
    """Segment the training texts with jieba and pickle them as a Bunch.

    Reads the module-level ``trainlabeldata`` and ``trainContentdata``,
    builds a ``Bunch`` with the fields ``label`` (the label rows, unchanged)
    and ``contents`` (one comma-separated token string per text), and
    pickles it to ``bunch_path``. Every kept token is also appended to the
    module-level ``trainContentdatasave``.

    Args:
        bunch_path: Destination file of the pickled Bunch.
    """
    bunch = Bunch(label=[], contents=[])
    bunch.label.extend(trainlabeldata)

    for row in trainContentdata:
        # Rows are lists of cell values. Calling str() on the list itself
        # would segment its repr ("['...']"), in which real CR/LF characters
        # appear as literal backslash escapes and survive the cleanup below,
        # so the cell values are joined instead.
        if isinstance(row, (list, tuple)):
            text = "".join(str(cell) for cell in row)
        else:
            text = str(row)
        text = text.replace("\r\n", "").replace(" ", "")

        # Keep only tokens longer than one character.
        tokens = [tok for tok in jieba.cut(text)
                  if len(tok) > 1 and tok != '\r\n']
        trainContentdatasave.extend(tokens)
        # Same ",tok1,tok2" layout as before (leading comma included), built
        # with a single join instead of repeated string concatenation.
        bunch.contents.append("".join("," + tok for tok in tokens))

    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("构建训练数据文本对象结束!!!")

def buildtestbunch(bunch_path):
    """Segment the test texts with jieba and pickle them as a Bunch.

    Reads the module-level ``testlabeldata`` and ``testContentdata``, builds
    a ``Bunch`` with the fields ``label`` (the label rows, unchanged) and
    ``contents`` (one comma-separated token string per text), and pickles it
    to ``bunch_path``. Every kept token is also appended to the module-level
    ``testContentdatasave``.

    Args:
        bunch_path: Destination file of the pickled Bunch.
    """
    bunch = Bunch(label=[], contents=[])
    bunch.label.extend(testlabeldata)

    for row in testContentdata:
        # Rows are lists of cell values. Calling str() on the list itself
        # would segment its repr ("['...']"), in which real CR/LF characters
        # appear as literal backslash escapes and survive the cleanup below,
        # so the cell values are joined instead.
        if isinstance(row, (list, tuple)):
            text = "".join(str(cell) for cell in row)
        else:
            text = str(row)
        text = text.replace("\r\n", "").replace(" ", "")

        # Keep only tokens longer than one character.
        tokens = [tok for tok in jieba.cut(text)
                  if len(tok) > 1 and tok != '\r\n']
        testContentdatasave.extend(tokens)
        # Same ",tok1,tok2" layout as before (leading comma included), built
        # with a single join instead of repeated string concatenation.
        bunch.contents.append("".join("," + tok for tok in tokens))

    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("构建测试数据文本对象结束!!!")

#读取停用词

def _readfile(path):
    """Return the complete content of the file at ``path`` as raw bytes."""
    with open(path, "rb") as stream:
        return stream.read()

# 读取bunch对象

def _readbunchobj(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    this pipeline wrote itself.
    """
    with open(path, "rb") as stream:
        return pickle.load(stream)

# 写入bunch对象

def _writebunchobj(path, bunchobj):
    """Pickle ``bunchobj`` into the file at ``path``, overwriting it."""
    with open(path, "wb") as sink:
        pickle.dump(bunchobj, sink)

def vector_space(stopword_path, bunch_path, space_path):
    """Build the training term-weight space and pickle it to ``space_path``.

    Loads the segmented-text Bunch stored at ``bunch_path``, vectorises its
    ``contents`` with ``TfidfVectorizer`` and pickles a new Bunch with the
    fields ``label`` (copied from the input), ``tdm`` (the weight matrix
    returned by ``fit_transform``) and ``vocabulary`` (term -> column index).

    Args:
        stopword_path: File holding one stop word per line.
        bunch_path: Pickled Bunch providing ``label`` and ``contents``.
        space_path: Destination file of the pickled result.
    """
    # The stop-word file is read as bytes, but the vectoriser compares stop
    # words against str tokens, so an undecoded list never matches anything.
    raw_stopwords = _readfile(stopword_path)
    if isinstance(raw_stopwords, bytes):
        # NOTE(review): the file's encoding is not stated anywhere; UTF-8
        # (with optional BOM) is tried first, then GBK - confirm.
        try:
            raw_stopwords = raw_stopwords.decode("utf-8-sig")
        except UnicodeDecodeError:
            raw_stopwords = raw_stopwords.decode("gbk", errors="ignore")
    stpwrdlst = [word.strip() for word in raw_stopwords.splitlines()
                 if word.strip()]

    # Segmented texts and labels produced by the build*bunch step.
    bunch = _readbunchobj(bunch_path)

    # tdm[i][j] will hold the weight of vocabulary term j in text i.
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})

    # use_idf=False: the weights are sublinear term frequencies
    # (normalised by the vectoriser's default norm), without an IDF factor.
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                 max_df=0.5, min_df=0.0001, use_idf=False,
                                 max_features=10000)

    # Learn the vocabulary and turn the texts into the weight matrix.
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_

    # Persist the vector space so later steps can reload it.
    _writebunchobj(space_path, tfidfspace)
    print("if-idf词向量空间实例创建成功!!!")

def testvector_space(stopword_path, bunch_path, space_path, train_tfidf_path):
    """Project the test texts onto the training vocabulary and pickle them.

    Loads the segmented-text Bunch stored at ``bunch_path`` and the training
    vector space stored at ``train_tfidf_path``, vectorises the test
    ``contents`` with the training vocabulary and pickles a Bunch with the
    fields ``label``, ``tdm`` and ``vocabulary`` to ``space_path``.

    ``vocabulary`` maps each term to its column index, e.g.
    ``{"我": 0, "喜欢": 1, "相国大人": 2}``; it defines the axes of the vector
    space, so reusing the training vocabulary keeps the test matrix columns
    aligned with the training matrix.

    Args:
        stopword_path: Unused; kept so existing callers keep working. The
            fixed training vocabulary alone decides which terms are
            counted. The list previously passed here was made of bytes and
            never matched a token, so omitting it changes no features.
        bunch_path: Pickled Bunch providing ``label`` and ``contents``.
        space_path: Destination file of the pickled result.
        train_tfidf_path: Pickled training vector space (needs a
            ``vocabulary`` field).
    """
    bunch = _readbunchobj(bunch_path)

    # Load the training vector space to reuse its vocabulary.
    trainbunch = _readbunchobj(train_tfidf_path)
    tfidfspace = Bunch(label=bunch.label, tdm=[],
                       vocabulary=trainbunch.vocabulary)

    # The weighting must match the one used for the training matrix
    # (sublinear_tf=True, use_idf=False). Leaving use_idf at its default of
    # True would fit IDF weights on the test set and scale the test features
    # differently from the training features.
    # max_df / min_df are not passed: they are ignored once a fixed
    # vocabulary is supplied.
    vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False,
                                 vocabulary=trainbunch.vocabulary)

    # With a fixed vocabulary and no IDF, fit_transform learns nothing from
    # the test texts; it only counts and normalises.
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

    _writebunchobj(space_path, tfidfspace)
    print("if-idf词向量空间实例创建成功!!!")

def metrics_result(actual, predict):
    """Print weighted precision, recall and F1 for the given predictions.

    All three scores are restricted to the labels that occur in ``predict``
    (``labels=np.unique(predict)``), so classes that were never predicted do
    not enter the averages.

    Args:
        actual: Ground-truth labels.
        predict: Predicted labels, aligned with ``actual``.
    """
    seen_labels = np.unique(predict)
    reports = (
        ('精度:{0:.3f}', metrics.precision_score),
        ('召回:{0:0.3f}', metrics.recall_score),
        ('f1-score:{0:.3f}', metrics.f1_score),
    )
    for template, scorer in reports:
        value = scorer(actual, predict, average='weighted', labels=seen_labels)
        print(template.format(value))

#准确率和召回率是相互影响的,理想情况下是二者都高,但是一般情况下准确率高,召回率就低;召回率高,准确率就低

if __name__=="__main__":
    # Load the raw texts and labels from the Excel workbooks into the
    # module-level lists.
    importTrainContentdata()
    importTestContentdata()
    importTrainlabeldata()
    importTestlabeldata()

    # Locations of the pickled intermediate files and the stop-word list.
    train_bunch_path = "F:/goverment/ArticleMining/trainbunch.bat"
    test_bunch_path = "F:/goverment/ArticleMining/testbunch.bat"
    stopword_path = "F:/goverment/ArticleMining/hlt_stop_words.txt"
    train_space_path = "F:/goverment/ArticleMining/traintfdifspace.dat"
    test_space_path = "F:/goverment/ArticleMining/testtfdifspace.dat"

    # Segment both data sets, then vectorise them (training first, because
    # the test step reuses the training vocabulary).
    buildtrainbunch(train_bunch_path)
    buildtestbunch(test_bunch_path)
    vector_space(stopword_path, train_bunch_path, train_space_path)
    testvector_space(stopword_path, test_bunch_path, test_space_path,
                     train_space_path)

    # Reload the persisted vector spaces.
    train_set = _readbunchobj(train_space_path)
    test_set = _readbunchobj(test_space_path)
    print(train_set.tdm)

    # Author's tuning notes, kept for reference:
    #   - low recall / F1: C=0.75 rbf: 0.59; C=0.8 rbf: 0.578
    #   - C=0.75 poly, gamma=10: precision 0.665, recall 0.330, f1 0.416
    #   - C=0.7 poly, gamma=10: recall 0.331, f1 0.417
    #   - MultinomialNB: "the smaller alpha, the more iterations and the
    #     higher the precision" (author's remark, not verified here)
    # Alternatives that were tried and disabled in earlier versions:
    #   MultinomialNB(alpha=0.052),
    #   svm.SVC(C=0.7, kernel='poly', gamma=10, decision_function_shape='ovr'),
    #   LogisticRegression(C=3), KNeighborsClassifier(n_neighbors=9).

    # Labels are stored as one single-element list per sample; flatten them
    # so the estimator receives a 1-D target.
    train_labels = np.ravel(train_set.label)
    test_labels = np.ravel(test_set.label)

    # The file imports the `svm` module, not the bare `SVC` name, so the
    # classifier must be reached through the module.
    clf = svm.SVC(C=1500)
    clf.fit(train_set.tdm, train_labels)
    predicted = clf.predict(test_set.tdm)
    print(clf.score(test_set.tdm, test_labels))

    # Convert both sides the same way; int(float(...)) also accepts values
    # such as 3.0 or "3.0".
    b = [int(float(value)) for value in predicted]
    a = [int(float(value)) for value in test_labels]

    # One prediction per line, followed by the completion marker. The
    # explicit encoding keeps the marker writable regardless of the
    # platform's default encoding.
    with open('F:/goverment/ArticleMining/predict.txt', 'w',
              encoding='utf-8') as f:
        f.writelines(str(value) + '\n' for value in b)
        f.write("写好了")

    metrics_result(a, b)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值