一、支持向量机
1.基本思想
支持向量机的最终目的是在特征空间中寻找一个能尽可能将两类数据分开的超平面。之所以称为"超"平面,是因为数据的特征空间很可能是高维空间;同时我们希望这个超平面能够以尽可能大的间隔将两类数据分开。
2.优点
(1)可用于线性、非线性分类(核函数),也可以用于回归
(2)低泛化错误
(3)推导过程清晰,容易解释
(4)计算复杂度较低
3.缺点
(1)对参数和核函数的选择比较敏感
(2)原始的SVM只适合处理二分类问题
4.理论推导
二、效果
正确率 = 84.91%
三、代码
import jieba
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
# NOTE: sklearn.externals.joblib was removed in scikit-learn >= 0.23;
# on newer versions use `import joblib` instead.
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split  # was missing: loadfile() calls it
from sklearn.svm import SVC
path = 'D:/SvmModel1.0/'
## 加载文件,导入数据,分词
def loadfile():
    """Load positive/negative review spreadsheets, label them, split into
    train/test sets, and tokenize every review with jieba.

    Side effects: writes the tokenized splits to
    data/x_trainframe.xlsx and data/x_testframe.xlsx under `path`.

    Returns:
        x_train, x_test: arrays of token lists (one list per review)
        y_train, y_test: labels (1.0 = positive, 0.0 = negative)
        x_trainframe, x_testframe: the intermediate DataFrames
    """
    # BUG FIX: read_excel has no `index` keyword — the original passed
    # index=None, which raises TypeError on modern pandas.
    neg = pd.read_excel(path + 'data/neg.xls', header=None)
    pos = pd.read_excel(path + 'data/pos.xls', header=None)
    # use 1 for positive sentiment, 0 for negative
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    x_train0, x_test0, y_train, y_test = train_test_split(
        np.concatenate((pos[0], neg[0])), y, test_size=0.2)
    x_trainframe = pd.DataFrame(x_train0)
    x_testframe = pd.DataFrame(x_test0)
    cw = lambda x: list(jieba.cut(x))  # tokenize one raw review string
    x_trainframe['words'] = x_trainframe[0].apply(cw)
    x_testframe['words'] = x_testframe[0].apply(cw)
    # persist the tokenized splits; the __main__ block reads
    # x_testframe.xlsx back for per-sentence prediction
    x_trainframe.to_excel(path + 'data/x_trainframe.xlsx')
    x_testframe.to_excel(path + 'data/x_testframe.xlsx')
    x_train = np.array(x_trainframe['words'])
    x_test = np.array(x_testframe['words'])
    return x_train, x_test, y_train, y_test, x_trainframe, x_testframe
## 对每个句子的所有词向量取均值
def buildWordVector(text, size, imdb_w2v):
    """Average the word vectors of all in-vocabulary words in `text`.

    Args:
        text: iterable of tokens (one tokenized sentence).
        size: dimensionality of the word vectors.
        imdb_w2v: word -> vector mapping (e.g. a trained Word2Vec
            model) that raises KeyError for out-of-vocabulary words.

    Returns:
        A (1, size) np.ndarray — the mean of the found vectors, or the
        zero vector if no word of `text` is in the vocabulary.
    """
    # BUG FIX: the original re-assigned size = 300 here, silently
    # overriding the caller's argument; the parameter is now honored.
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            # out-of-vocabulary word: skip it
            continue
    if count != 0:
        vec /= count
    return vec
## 计算词向量
def get_train_vecs(x_train,x_test):
n_dim = 300
#Initialize model and build vocab
imdb_w2v = Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(x_train)
#Train the model over train_reviews (this may take several minutes)
imdb_w2v.train(sentences=x_train, total_examples=imdb_w2v.corpus_count, epochs=20)
train_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_train])
np.save(path+'train_vecs.npy',train_vecs)
print(train_vecs.shape)
#Train word2vec on test tweets
imdb_w2v.train(sentences=x_test, total_examples=imdb_w2v.corpus_count, epochs=20)
imdb_w2v.save(path+'w2v_model.pkl')
#Build test tweet vectors then scale
test_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_test])
#test_vecs = scale(test_vecs)
np.save(path+'test_vecs.npy',test_vecs)
print(test_vecs.shape)
return train_vecs, test_vecs
## 训练svm模型
def svm_train(train_vecs, y_train, test_vecs, y_test):
    """Fit an RBF-kernel SVM on the sentence vectors, persist it to
    disk, and report accuracy on the held-out test vectors.

    Side effect: dumps the fitted classifier to `path + 'model.pkl'`.

    Returns:
        Mean accuracy of the classifier on (test_vecs, y_test).
    """
    classifier = SVC(kernel='rbf', verbose=True)
    classifier.fit(train_vecs, y_train)
    # persist the fitted model; __main__ reloads it for prediction
    joblib.dump(classifier, path + 'model.pkl')
    accuracy = classifier.score(test_vecs, y_test)
    print(accuracy)
    return accuracy
## 对单个句子进行情感判断
if __name__ == '__main__':
    # End-to-end pipeline: load & tokenize, embed, train SVM, then
    # re-predict every saved test sentence and write results to Excel.
    x_train, x_test, y_train, y_test, x_trainframe, x_testframe = loadfile()
    train_vecs, test_vecs = get_train_vecs(x_train, x_test)
    score = svm_train(train_vecs, y_train, test_vecs, y_test)

    # PERF FIX: the original reloaded the word2vec model (inside
    # get_predict_vecs) and the SVM (joblib.load) on EVERY loop
    # iteration; both are now loaded once, before the loop.
    n_dim = 300
    imdb_w2v = Word2Vec.load(path + 'w2v_model.pkl')
    clf = joblib.load(path + 'model.pkl')

    # 得到待预测单个句子的词向量
    def get_predict_vecs(words):
        """Turn one tokenized sentence into its averaged word vector."""
        return buildWordVector(words, n_dim, imdb_w2v)

    text = pd.read_excel(path + 'data/x_testframe.xlsx', header=0, index_col=0)
    strings = text[0]
    labels = []
    for string in strings:
        words = jieba.lcut(string)
        words_vecs = get_predict_vecs(words)
        result = clf.predict(words_vecs)
        labels.append(result[0])
        if int(result[0]) == 1:
            print(string, 'Yes')
        else:
            print(string, 'No')
    # NOTE(review): this assumes rows read back from x_testframe.xlsx are
    # in the same order as y_test — true here because loadfile() wrote the
    # frame in exactly that order; confirm if loadfile changes.
    text['标记--真实'] = y_test
    text['标记--预测'] = labels
    text.to_excel(path + 'text.xlsx')
该博客介绍了支持向量机(SVM)的基本思想、优缺点,并展示了如何使用SVM进行文本情感分析。通过加载数据、分词、构建词向量并训练SVM模型,最终实现对电影评论的情感分类,正确率为84.91%。此外,还涉及了词向量的计算和对单个句子的情感判断方法。
890

被折叠的 条评论
为什么被折叠?



