支持向量机matlab代码程序_自然语言处理--支持向量机分类(理论+代码)

该博客介绍了支持向量机(SVM)的基本思想、优缺点,并展示了如何使用SVM进行文本情感分析。通过加载数据、分词、构建词向量并训练SVM模型,最终实现对电影评论的情感分类,正确率为84.91%。此外,还涉及了词向量的计算和对单个句子的情感判断方法。
摘要由CSDN通过智能技术生成

一、支持向量机

1.基本思想

支持向量机的最终目的是在特征空间中寻找一个能尽可能将两类数据分开的超平面。之所以称为“超”平面,是因为数据的特征空间很可能是高维空间;同时我们希望这个超平面与两类数据之间的间隔尽可能大。

e3595ce12672bc25956e2e7b0e2a3dc7.png

2.优点

(1)可用于线性、非线性分类(核函数),也可以用于回归

(2)低泛化错误

(3)推导过程清晰,容易解释

(4)计算复杂度较低

3.缺点

(1)对参数和核函数的选择比较敏感

(2)原始的svm只适合处理二分类问题

4.理论推导

12e80b88edf5dca99ead479c1a4b982b.png

f274ad938a0c9dace161058f883be999.png

1f97a28aa6df2bbbc3832681fb8f5014.png

16ab3cb2350c697f0260b2b12647bb7d.png

二、效果

正确率 = 84.91%

cb01730aec2da499b4da0e78e5bbdf30.png

三、代码

import jieba
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
path = 'D:/SvmModel1.0/'

## 加载文件,导入数据,分词
def loadfile():
    """Load the labelled corpora, split them and tokenise every text.

    Reads ``data/neg.xls`` and ``data/pos.xls`` below ``path`` (no header
    row; column 0 holds the raw text), labels positive rows 1 and negative
    rows 0, makes a random 80/20 train/test split and tokenises each text
    with ``jieba.cut``.  The two tokenised frames are also written to
    ``data/x_trainframe.xlsx`` and ``data/x_testframe.xlsx``.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test, x_trainframe,
        x_testframe)`` where ``x_train``/``x_test`` are arrays of token
        lists, ``y_train``/``y_test`` the matching 1/0 labels, and the two
        frames hold the raw text (column ``0``) next to its tokens
        (column ``'words'``).
    """
    # ``index=None`` was dropped: it is not a ``read_excel`` parameter and the
    # default ``index_col=None`` already yields a plain positional index.
    neg = pd.read_excel(path+'data/neg.xls', header=None)
    pos = pd.read_excel(path+'data/pos.xls', header=None)

    # Use 1 for positive sentiment, 0 for negative; the order must match the
    # order in which the texts are concatenated below (pos first, then neg).
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    x_train0, x_test0, y_train, y_test = train_test_split(
        np.concatenate((pos[0], neg[0])), y, test_size=0.2)

    def tokenize(sentence):
        """Segment one raw text into a list of jieba tokens."""
        return list(jieba.cut(sentence))

    x_trainframe = pd.DataFrame(x_train0)
    x_testframe = pd.DataFrame(x_test0)
    x_trainframe['words'] = x_trainframe[0].apply(tokenize)
    x_testframe['words'] = x_testframe[0].apply(tokenize)

    # Persist the split so the exact train/test texts can be inspected later.
    x_trainframe.to_excel(path+'data/x_trainframe.xlsx')
    x_testframe.to_excel(path+'data/x_testframe.xlsx')

    x_train = np.array(x_trainframe['words'])
    x_test = np.array(x_testframe['words'])
    return x_train, x_test, y_train, y_test, x_trainframe, x_testframe
 

## 对每个句子的所有词向量取均值
def buildWordVector(text, size, imdb_w2v):
    """Return the mean word vector of a tokenised sentence.

    Args:
        text: Iterable of tokens making up one sentence.
        size: Dimensionality of the word vectors.
        imdb_w2v: Trained word2vec model, or any mapping from token to a
            vector of ``size`` elements.

    Returns:
        numpy.ndarray of shape ``(1, size)``: the average of the vectors of
        all tokens known to ``imdb_w2v``; all zeros when no token is known.
    """
    # Word2Vec models expose their vectors under ``.wv``; fall back to
    # indexing the object itself for plain mappings / keyed vectors.
    vectors = getattr(imdb_w2v, 'wv', imdb_w2v)

    vec = np.zeros((1, size))
    count = 0.
    for word in text:
        try:
            vec += np.asarray(vectors[word]).reshape((1, size))
        except KeyError:
            # Out-of-vocabulary token: leave it out of the average.
            continue
        count += 1.
    if count != 0:
        vec /= count
    return vec


## 计算词向量
def get_train_vecs(x_train, x_test):
    """Train word2vec and turn every sentence into an averaged vector.

    A 300-dimensional Word2Vec model is built on ``x_train``, the training
    sentences are vectorised with ``buildWordVector``, the model is then
    trained further on ``x_test`` and the test sentences are vectorised.
    Side effects: writes ``train_vecs.npy``, ``test_vecs.npy`` and
    ``w2v_model.pkl`` below ``path`` and prints both matrix shapes.

    Args:
        x_train: Sequence of token lists used to build the vocabulary.
        x_test: Sequence of token lists for the held-out set.

    Returns:
        tuple: ``(train_vecs, test_vecs)``, one row per sentence and
        ``n_dim`` columns each.
    """
    n_dim = 300

    # Initialise the model and build the vocabulary from the training set.
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)

    # Train on the training sentences (this may take several minutes).
    imdb_w2v.train(sentences=x_train, total_examples=imdb_w2v.corpus_count, epochs=20)
    train_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_train])
    np.save(path+'train_vecs.npy', train_vecs)
    print(train_vecs.shape)

    # Continue training on the test sentences.  ``total_examples`` must be
    # the size of the corpus passed to this call; ``corpus_count`` still
    # holds the size of the training corpus seen by ``build_vocab``.
    # NOTE(review): the vocabulary is not updated here, and the train vectors
    # above were computed before this extra pass -- confirm this is intended.
    imdb_w2v.train(sentences=x_test, total_examples=len(x_test), epochs=20)
    imdb_w2v.save(path+'w2v_model.pkl')

    # Build the test sentence vectors with the updated model.
    test_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_test])
    np.save(path+'test_vecs.npy', test_vecs)
    print(test_vecs.shape)

    return train_vecs, test_vecs


## 训练svm模型
def svm_train(train_vecs, y_train, test_vecs, y_test):
    """Fit an RBF-kernel SVC, save it and report its score on the test set.

    Args:
        train_vecs: Feature matrix used for fitting.
        y_train: Labels for ``train_vecs``.
        test_vecs: Feature matrix used for scoring.
        y_test: Labels for ``test_vecs``.

    Returns:
        The value of ``SVC.score`` on the test data; it is also printed.
        The fitted classifier is dumped to ``model.pkl`` below ``path``.
    """
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    classifier = SVC(kernel='rbf', verbose=True).fit(train_vecs, y_train)
    joblib.dump(classifier, path+'model.pkl')

    accuracy = classifier.score(test_vecs, y_test)
    print(accuracy)
    return accuracy


## 对单个句子进行情感判断    
if __name__=='__main__':
    # Full pipeline: load/split the data, build sentence vectors, train SVM.
    x_train, x_test, y_train, y_test, x_trainframe, x_testframe = loadfile()
    train_vecs, test_vecs = get_train_vecs(x_train,x_test)
    score = svm_train(train_vecs, y_train, test_vecs, y_test)

    # Load the persisted artefacts once; they do not change between
    # sentences, so reloading them inside the loop only repeats disk reads.
    imdb_w2v = Word2Vec.load(path+'w2v_model.pkl')
    clf = joblib.load(path+'model.pkl')

    def get_predict_vecs(words):
        """Return the averaged word vector for one tokenised sentence."""
        n_dim = 300
        return buildWordVector(words, n_dim, imdb_w2v)

    # Re-read the test texts saved by ``loadfile`` and classify them one by one.
    text = pd.read_excel(path+'data/x_testframe.xlsx',header=0, index_col=0)
    strings = text[0]

    labels = []
    for string in strings:
        words = jieba.lcut(string)
        result = clf.predict(get_predict_vecs(words))
        labels.append(result[0])

        # Label 1 means positive sentiment, anything else negative.
        print(string, 'Yes' if int(result[0]) == 1 else 'No')

    # Store true and predicted labels side by side for inspection.
    text['标记--真实'] = y_test
    text['标记--预测'] = labels
    text.to_excel(path+'text.xlsx')
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值