# Load the word-vector model (kept in its own section because model loading is slow!)
# -*- coding:UTF-8 -*-
# Environment setup: imports plus word2vec model loading.
# NOTE: this is Python 2 code (print statements, reload(sys), list-returning map()).
#
# FIX: `from __future__ import division` must be the first statement after the
# coding declaration -- placing it after other imports (as before) is a SyntaxError.
# Also removed the duplicated `import codecs`.
from __future__ import division  # true division, used for the accuracy ratios at the end

import sys
import codecs  # open files with an explicit encoding
import jieba
import jieba.posseg as pseg
import gensim
from sklearn import svm  # SVM classifier

# Python 2 encoding hack so implicit str<->unicode conversions use UTF-8.
# (Widely discouraged, but the rest of the script relies on it.)
reload(sys)
sys.setdefaultencoding('utf-8')

# model = gensim.models.Word2Vec.load("22620491.model")
model = gensim.models.KeyedVectors.load_word2vec_format('news_12g_baidubaike_20g_novel_90g_embedding_64.bin', binary=True)
word_vec = model.wv  # keep only the vector lookup object
del model  # the vectors now live in word_vec, so drop the model reference
# ---- Extract the keywords ----
# Read the 165 emotion keywords into ekey.
# (Filename suggests 33*5; maxsplit=32 allows up to 33 tokens per line -- TODO confirm layout.)
ekey=[] # the 165 keywords
keywords=codecs.open("keywords33*5.txt","r","utf-8-sig")
lines=keywords.readlines()
for line in lines:
    words=line.split(" ",32)  # split on at most 32 spaces -> up to 33 tokens
    for word in words:
        print word
        word=word.replace("\r\n","")  # strip the Windows line terminator
        ekey.append(word) # word_vec lookups require unicode input
print(ekey)
# for i in ekey:
#     word_vec[i]
# print(word_vec[ekey[0]])
# print("end")
# print(len(ekey))
keywords.close()
# ---- Generate the training and test sets: change the input/output filenames and run twice, once per corpus! ----
# Build one 165-dimensional feature vector per sentence: for each keyword,
# the maximum word2vec similarity between that keyword and any word of the
# sentence (0 when no sentence word has a vector).
# Run twice -- once for the training corpus (xlj) and once for the test
# corpus (csj) -- changing the input and output filenames each time.
smlrt_svm=codecs.open("similarity_svm_csj.txt","w","utf-8-sig") # output; switch between xlj and csj runs
sentence=codecs.open("csj_fenci.txt","r","utf-8-sig")  # pre-segmented input: "<label> <word> <word> ...\r\n"
lines=sentence.readlines()
# NOTE(review): `sentence` is never closed.
for line in lines:
    stc=[]  # words of the current sentence
    e=[]    # feature vector of the current sentence
    if line.split(" ",1)[1]!="\r\n":  # skip empty sentences (label followed by nothing)
        words=line.split(" ",1)[1].split(" ")
        for word in words:
            word=word.replace("\r\n","")  # strip the Windows line terminator
            stc.append(word)
        print(stc)
        for key in ekey:
            maxs=-1  # sentinel: no similarity computed yet
            for i in stc:
                try:
                    s=word_vec.similarity(key, i)
                    if s>maxs:
                        maxs=s
                    # print(s)
                except:
                    # NOTE(review): bare except skips out-of-vocabulary words
                    # (KeyError) but also hides any other error -- narrow it.
                    continue
            if maxs==-1:
                maxs=0  # no word of the sentence had a vector -> feature is 0
                e.append(maxs)
            else:
                e.append(maxs)
        print(e)
        # Write "<label> [f1, f2, ...]": str(e) keeps the Python list syntax,
        # which the SVM section below strips back off.
        smlrt_svm.write(line.split(" ",1)[0].split(" ",1)[0]+" "+str(e)+"\r\n")
smlrt_svm.close()
# ---- Balance the dataset: also change the input/output filenames and run twice! ----
# Balance the class distribution by duplicating lines of the minority
# classes. Run twice (xlj / csj) and tune the duplication factors per corpus.
import codecs # run twice for xlj and csj; also adjust the duplication factors
xlj = codecs.open("similarity_svm_csj.txt", 'r', 'utf-8') # codecs gives the TXT an explicit encoding
# csj = codecs.open("csj_no0.txt", 'r', 'utf-8') # codecs gives the TXT an explicit encoding
xllines = xlj.readlines()
# cslines = csj.readlines()
xlj_blc = codecs.open('s_s_csj_blc165.txt','w','utf-8')
# csj_blc = codecs.open('csj_blc.txt','w','utf-8')
# Per-class output counters for labels 0..4.
# NOTE(review): names look like pinyin for joy/sorrow/anger/surprise/disgust -- confirm.
le,ai,nu,jing,wu=0,0,0,0,0
for line in xllines:
    xlj_blc.write(line)  # every line is written at least once
    if line.split(" ",1)[0]=="1":
        xlj_blc.write(line)  # class 1: duplicated once -> 2 copies
        ai+=2
    elif line.split(" ",1)[0]=="2":
        nu+=1
        for i in range(7):#csj7 xlj6 -- class 2: 1 + 7 = 8 copies for csj
            nu += 1
            xlj_blc.write(line)
    elif line.split(" ",1)[0]=="3":
        xlj_blc.write(line)  # class 3: duplicated once -> 2 copies
        jing+=2
    elif line.split(" ",1)[0]=="4":
        wu+=1
        for i in range(5): #csj5 xlj9 -- class 4: 1 + 5 = 6 copies for csj
            wu+=1
            xlj_blc.write(line)
    elif line.split(" ",1)[0]=="0":
        le+=1  # class 0 is the majority class: kept at 1 copy
print(le,ai,nu,jing,wu)
# ---- Feed the SVM for multi-class classification; print per-class accuracies ----
# Possible error mentioning something like "10": it means there is a blank
# entry -- wrap in a try, find the offending line, and delete it by hand.
# Train a multi-class SVM (one-vs-one) on the balanced xlj feature file.
xlj = codecs.open("s_s_xlj_blc165.txt","r","utf-8-sig")
# doc = open("res.txt","w")
lines = xlj.readlines()
x=[]  # feature vectors
y=[]  # integer class labels, aligned with x
zero=[]  # the all-zero 165-dim vector, used to drop featureless sentences
for i in range(0,165):
    zero.append(0)
for line in lines:
    # Line format: "<label> [f1, f2, ..., f165]" -- strip the brackets, split on commas.
    a=line.split(" ",1)[1].replace("[","").replace("]","").split(",")
    # Python 2: map returns a list. SECURITY NOTE: eval on file content is
    # dangerous in general; tolerated here only because the file is generated
    # by this same script (float() would be safer).
    a = map(eval, a)
    if a!= zero: # drop feature vectors that are all zero (no word had a vector)
        x.append(a)
        try:
            b = int(line.split(" ", 1)[0])
            y.append(b)
        except:
            # A malformed label leaves x one element longer than y; print the
            # offending line's index so it can be removed by hand.
            print lines.index(line)
            print line
xlj.close()
clf = svm.SVC(decision_function_shape='ovo')  # one-vs-one multi-class SVM
clf.fit(x, y)
# print (len(x))
# print (len(y))
# print (len(a))
# Evaluate on the balanced csj feature file: overall accuracy plus
# per-class accuracy for the five classes (labels 0..4).
x_test=[]
y_test=[]
csj = codecs.open("s_s_csj_blc165.txt","r","utf-8-sig")
lines = csj.readlines()
print(len(lines))
for line in lines:
    # Same parsing as the training loop above (see the eval caveat there).
    a=line.split(" ",1)[1].replace("[","").replace("]","").split(",")
    a = map(eval, a)  # Python 2: returns a list of floats
    if a!= zero: # drop feature vectors that are all zero
        x_test.append(a)
        b = int(line.split(" ", 1)[0])  # no try here: the test file is assumed clean
        y_test.append(b)
print len(x_test)
print len(y_test)
print x_test
# y_hat = clf.predict(x)
print clf.score(x_test, y_test) # overall test-set accuracy
print clf.score(x, y)  # training-set accuracy (sanity check)
le,ai,nu,jing,wu=0,0,0,0,0 # per-class correct-prediction counts, labels 0..4
sumle,sumai,sumnu,sumjing,sumwu=0,0,0,0,0  # per-class sample totals
for i in range(len(x_test)):
    # One predict() call per sample; slow but simple.
    if y_test[i]==0:
        sumle+=1
        if clf.predict([x_test[i]])==y_test[i] :
            le=le+1
    elif y_test[i]==1:
        sumai+=1
        if clf.predict([x_test[i]])==y_test[i] :
            ai+=1
    elif y_test[i]==2:
        sumnu+=1
        if clf.predict([x_test[i]])==y_test[i] :
            nu+=1
    elif y_test[i]==3:
        sumjing+=1
        if clf.predict([x_test[i]])==y_test[i] :
            jing+=1
    elif y_test[i]==4:
        sumwu+=1
        if clf.predict([x_test[i]])==y_test[i] :
            wu+=1
# True division (via `from __future__ import division`), so these are real ratios.
print(le/sumle,ai/sumai,nu/sumnu,jing/sumjing,wu/sumwu)
print(le,ai,nu,jing,wu)
print(sumle,sumai,sumnu,sumjing,sumwu)
csj.close()
# for line in lines:
#     X=line.split(" ",1)[1].replace("[","").replace("]","").split(",")
#     X = map(eval, X)
#     res=clf.predict([X])
#     # print(clf.predict([X])) # predict
#     if clf.predict([X]).tolist()[0] == int(line.split(" ",1)[0]):
#         cnt=cnt+1
#         print(line.split(" ",1)[0])
# print cnt