20181007 增加到33*5=165个关键字的(相似度+SVM方法)代码

导入词向量模型:单独拉出来,因为模型加载很慢!




# -*- coding:UTF-8 -*-
from sklearn import svm  #SVM导入
import codecs
from __future__ import division #除法


#词向量导入
import sys
import codecs   #可以以特定编码打开文件
import jieba
import jieba.posseg as pseg
reload(sys)               #zzh说这种方法不好,不要再用了!!!  可是真的很好用啊 QAQ
sys.setdefaultencoding('utf-8')
import gensim

# model = gensim.models.Word2Vec.load("22620491.model")
model = gensim.models.KeyedVectors.load_word2vec_format('news_12g_baidubaike_20g_novel_90g_embedding_64.bin', binary=True)
word_vec = model.wv
del model     #把模型给word_vec,所以Model删掉。

提取关键词

# Extract the 165 keywords (33 groups x 5 words) that define the feature space.
ekey = []  # the 165 keywords
keywords = codecs.open("keywords33*5.txt", "r", "utf-8-sig")
try:
    for line in keywords.readlines():
        # each line holds up to 33 space-separated words
        for word in line.split(" ", 32):
            word = word.replace("\r\n", "")  # strip the Windows line ending kept by readlines()
            print(word)
            ekey.append(word)  # word_vec lookups require unicode strings
finally:
    keywords.close()  # close the file even if parsing fails
print(ekey)

生成训练集和测试集:要改两次输入分别输出哦!!!

# Build feature vectors: for each sentence, the maximum word2vec similarity
# between each of the 165 keywords and any token of the sentence (0 when
# neither side is in the embedding vocabulary).
# Run twice — once per input file (training set xlj / test set csj) — changing
# the input and output file names accordingly.
smlrt_svm = codecs.open("similarity_svm_csj.txt", "w", "utf-8-sig")
sentence = codecs.open("csj_fenci.txt", "r", "utf-8-sig")
try:
    for line in sentence.readlines():
        body = line.split("  ", 1)[1]
        if body == "\r\n":  # skip empty lines
            continue
        stc = [w.replace("\r\n", "") for w in body.split(" ")]  # sentence tokens
        print(stc)

        e = []  # 165-dim similarity vector for this sentence
        for key in ekey:
            maxs = -1
            for tok in stc:
                try:
                    s = word_vec.similarity(key, tok)
                except KeyError:  # keyword or token not in the embedding vocabulary
                    continue
                if s > maxs:
                    maxs = s
            e.append(maxs if maxs != -1 else 0)  # no in-vocab token -> 0
        print(e)
        # output line format: "<label> [v1, v2, ..., v165]"
        smlrt_svm.write(line.split("  ", 1)[0].split(" ", 1)[0] + " " + str(e) + "\r\n")
finally:
    smlrt_svm.close()
    sentence.close()  # original never closed the input handle

平衡数据集:也要改两次输入分别输出哦!!!

# Oversample the minority classes to balance the data set.
# Run twice (xlj / csj) and adjust the duplication factors per file.
xlj = codecs.open("similarity_svm_csj.txt", 'r', 'utf-8')
xlj_blc = codecs.open('s_s_csj_blc165.txt', 'w', 'utf-8')

le, ai, nu, jing, wu = 0, 0, 0, 0, 0  # per-class line counters after balancing

try:
    for line in xlj.readlines():
        label = line.split(" ", 1)[0]  # class id is the first token of the line
        xlj_blc.write(line)            # every line is written at least once
        if label == "1":
            xlj_blc.write(line)        # duplicate class 1 (x2)
            ai += 2
        elif label == "2":
            nu += 1
            for _ in range(7):         # csj: 7 extra copies; xlj: 6
                nu += 1
                xlj_blc.write(line)
        elif label == "3":
            xlj_blc.write(line)        # duplicate class 3 (x2)
            jing += 2
        elif label == "4":
            wu += 1
            for _ in range(5):         # csj: 5 extra copies; xlj: 9
                wu += 1
                xlj_blc.write(line)
        elif label == "0":
            le += 1                    # class 0 is the majority: no duplication
finally:
    xlj.close()
    # the original never closed the output file — buffered lines could be lost
    xlj_blc.close()
print(le, ai, nu, jing, wu)



输入SVM进行多分类,输出各个分类准确率:

读入时有可能报错(例如 invalid literal for int() 之类的异常),通常意味着文件里有空白行或缺失标签;用 try 捕获并打印出问题行的行号和内容,再到文件里手动删掉那一行就好了!

# Train a one-vs-one SVM on the balanced training vectors.
# Input lines look like: "<label> [v1, v2, ..., v165]".
xlj = codecs.open("s_s_xlj_blc165.txt", "r", "utf-8-sig")

x = []  # feature vectors
y = []  # integer class labels, aligned with x

zero = [0] * 165  # all-zero vector = sentence with no in-vocab word

for line in xlj.readlines():
    vec_text = line.split(" ", 1)[1].replace("[", "").replace("]", "")
    a = [float(v) for v in vec_text.split(",")]  # parse numbers; avoids eval on file data
    if a == zero:  # drop sentences that produced no usable features
        continue
    try:
        label = int(line.split(" ", 1)[0])
    except ValueError:
        # Malformed or blank label (e.g. an empty line): report and skip.
        # Skipping BEFORE appending keeps x and y the same length — the
        # original appended to x first, desynchronizing the two lists.
        print(line)
        continue
    x.append(a)
    y.append(label)

xlj.close()
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(x, y)


# print (len(x))
# print (len(y))
# print (len(a))
# Evaluate on the (balanced) test set and report overall and per-class accuracy.
x_test = []
y_test = []

csj = codecs.open("s_s_csj_blc165.txt", "r", "utf-8-sig")
lines = csj.readlines()
csj.close()  # done reading; close immediately instead of at script end

print(len(lines))

for line in lines:
    vec_text = line.split(" ", 1)[1].replace("[", "").replace("]", "")
    a = [float(v) for v in vec_text.split(",")]  # parse numbers; avoids eval on file data
    if a != zero:  # drop all-zero vectors (sentence had no in-vocab word)
        x_test.append(a)
        y_test.append(int(line.split(" ", 1)[0]))

print(len(x_test))
print(len(y_test))
print(clf.score(x_test, y_test))  # overall test-set accuracy
print(clf.score(x, y))            # training-set accuracy (sanity check)

# Per-class accuracy. Predict the whole set ONCE instead of calling
# clf.predict per sample inside every elif branch as the original did.
y_pred = clf.predict(x_test)
hits = [0, 0, 0, 0, 0]    # correct predictions per class 0..4 (le/ai/nu/jing/wu)
totals = [0, 0, 0, 0, 0]  # test samples per class 0..4
for truth, pred in zip(y_test, y_pred):
    if 0 <= truth <= 4:
        totals[truth] += 1
        if pred == truth:
            hits[truth] += 1

le, ai, nu, jing, wu = hits
sumle, sumai, sumnu, sumjing, sumwu = totals
# guard against a class being absent from the test set (ZeroDivisionError)
print(tuple(h / t if t else 0 for h, t in zip(hits, totals)))
print(le, ai, nu, jing, wu)
print(sumle, sumai, sumnu, sumjing, sumwu)


    
    

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值