1.综述
我们最近遇到了一个类似于情感二分类的问题:给定一个短句子,判断这个句子是positive还是negative。因为还处于学习的初步阶段,我们解决这个问题采用的机器学习方法是简单的全连接神经网络,并且手写实现了这个神经网络。
首先,接收一句话,使用jieba进行分词,把分词结果去掉其中的停止词,之后查词向量表把每一个词转化为一个64维的词向量,并且为了计算方便,把句子规范为有10个词(词向量)的矩阵,对于短于10个词的句子在矩阵的后几列补零向量做补齐,对于长于10个词的句子,则采用截取前10个词的方法。
之后,将这句话生成的64×10的矩阵进行拉伸,组装成一个640维的向量作为输入,输入到神经网络中,相应的神经网络的输入层有640个神经元用于接受输入,隐层神经元的个数可以根据最终的效果进行调整,输出层为两个神经元,分别表示预测结果1(positive)或0(negative)。
需要的文件https://pan.baidu.com/s/14vsD8QJFZHZRGAiSzJb-Og
![](https://i-blog.csdnimg.cn/blog_migrate/756af563e0bb74e58874c40431b50958.png)
2.训练神经网络
2.1生成词向量
“sentiment_XS_30k.txt”中存储了29000多条数据,这些数据是已经分好词,标注好,并且去掉了停止词的句子,可以作为我们的训练数据集。首先,把句子中的每个词转化为64维的词向量,之后为了计算方便,把每个句子规范为有10个词(词向量)的矩阵,对于短于10个词的句子在矩阵的后几列补零向量做补齐,对于长于10个词的句子,则采用截取前10个词的方法。最后把每一个句子的标注(1代表positive,0代表negative),和生成的词向量一起拼接成一行,存储在“word2vector.txt”中。创建deal2vector.py,代码如下:
import gensim
import numpy as np
# Pre-trained word2vec embedding file in binary format.
vector_model_path = 'news_12g_baidubaike_20g_novel_90g_embedding_64.bin'
# Load at most the first 100000 vectors from the embedding file.
vector_model = gensim.models.KeyedVectors.load_word2vec_format(
    vector_model_path,
    binary=True,
    limit=100000,
)
def word2vector(src_path="sentiment_XS_30k.txt", dst_path="word2vector.txt",
                max_lines=29312, model=None):
    """Convert labelled, pre-tokenised sentences into fixed-length rows.

    Each input line has the form ``<label>,<word> <word> ...`` where the
    label is ``positive`` (stored as 1) or ``negative`` (stored as 0).
    Every word is looked up in the word-vector model; words missing from
    the model become zero vectors.  The sentence is truncated or
    zero-padded to exactly 10 words of 64 values each, and the row
    ``[label, v1, ..., v640]`` is written to *dst_path* as the ``str()``
    of a Python list, one row per line.

    The test set can be produced with
    ``word2vector("sentiment_XS_test.txt", "wector.txt")``.

    Args:
        src_path: UTF-8 text file holding the labelled sentences.
        dst_path: Output file; overwritten if it already exists.
        max_lines: Upper bound on the number of input lines read.
        model: Object indexable by word that yields a 64-value vector and
            raises ``KeyError`` for unknown words.  Defaults to the
            module-level ``vector_model``.
    """
    dim, max_words = 64, 10
    row_width = dim * max_words
    labels = {"positive": 1, "negative": 0}
    if model is None:
        model = vector_model

    with open(src_path, encoding="utf-8") as src, open(dst_path, "w") as dst:
        for _ in range(max_lines):
            line = src.readline()
            if not line:
                # Reached end of file before max_lines; stop instead of crashing.
                break
            if not line.strip():
                continue
            label_text, sep, sentence = line.partition(",")
            if not sep or label_text not in labels:
                # A row without a valid label cannot be parsed downstream, so skip it.
                print("error!")
                continue

            flat = []
            # Only the first 10 tokens can contribute to the 640 values.
            for word in sentence.split(" ")[:max_words]:
                try:
                    vec = model[word.strip()]
                except KeyError:
                    # Words absent from the embedding table become zero vectors.
                    vec = np.zeros(dim)
                flat.extend(np.asarray(vec).ravel().tolist())

            # Zero-pad short sentences; sentences of exactly 10 words need no padding
            # but must still be written.
            flat.extend([0.0] * (row_width - len(flat)))
            dst.write(str([labels[label_text]] + flat) + '\n')
def numextract(lists):
    """Flatten a sequence of NumPy arrays into one list of Python scalars.

    For each array in *lists*, its first ``len(array)`` elements (in flat
    order) are read with ``ndarray.item`` and appended in sequence.
    """
    return [array.item(pos) for array in lists for pos in range(len(array))]
# Script entry point: generate the word-vector file when run directly.
if __name__ == "__main__":
    word2vector()
2.2训练模型并存储参数
读取刚刚生成的“word2vector.txt”,并把这些数据装配成神经网络可接受的形式。创建load_data.py,代码如下:
import numpy as np
# 训练集
def dealData(path="word2vector.txt"):
    """Parse the word-vector file into feature and label arrays.

    Each non-blank line is the ``str()`` of a Python list: an integer
    label followed by the feature values, e.g. ``[1, 0.5, ..., 0.0]``.

    Args:
        path: File to read; defaults to the training file
            ``word2vector.txt``.

    Returns:
        A tuple ``(data, labels)`` of NumPy arrays: one row of floats per
        line, and the matching integer labels.
    """
    rows = []
    labels = []
    with open(path) as f:
        for raw in f:
            # Drop the list brackets written by str(list); skip blank lines.
            cleaned = raw.replace("[", "").replace("]", "").strip()
            if not cleaned:
                continue
            label_text, _, values_text = cleaned.partition(",")
            labels.append(int(label_text.strip()))
            rows.append(np.array([float(v) for v in values_text.split(",")]))
    return np.array(rows), np.array(labels)
def load_data_training():
    """Build the training set in the form the network consumes.

    Returns:
        A list of ``(x, y)`` tuples where ``x`` is a ``(640, 1)`` column
        vector of features and ``y`` is the ``(2, 1)`` one-hot target
        produced by ``vectorized_result``.
    """
    features, labels = dealData()
    columns = [np.reshape(row, (640, 1)) for row in features]
    targets = [vectorized_result(label) for label in labels]
    return [pair for pair in zip(columns, targets)]
def vectorized_result(j):
    """Return a ``(2, 1)`` column vector with 1.0 at row *j* and 0.0 elsewhere."""
    one_hot = np.zeros(shape=(2, 1))
    one_hot[j] = 1.0
    return one_hot
使用训练集数据训练神经网络,模型迭代十次,并把最终的weights和biases存下来。创建network.py,其中“import src.integration.load_data as loader”为刚刚创建的load_data.py存储位置,代码如下:
import random
import src.integration.load_data as loader
import numpy as np
import pickle
class Network(object):
    """Fully connected feed-forward network with sigmoid activations.

    Trained with mini-batch stochastic gradient descent; gradients are
    computed by backpropagation.
    """

    def __init__(self, sizes):
        """Create a network whose layer widths are given by *sizes*.

        Biases and weights are drawn from ``np.random.randn``; the input
        layer has no biases.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        fan_ins, fan_outs = sizes[:-1], sizes[1:]
        # Biases are drawn before weights so the random-number order is fixed.
        self.biases = [np.random.randn(n_out, 1) for n_out in fan_outs]
        self.weights = [np.random.randn(n_out, n_in)
                        for n_in, n_out in zip(fan_ins, fan_outs)]

    def feedforward(self, a):
        """Return the network output for input column vector *a*."""
        for bias, weight in zip(self.biases, self.weights):
            a = sigmoid(np.dot(weight, a) + bias)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta):
        """Train for *epochs* passes over *training_data*.

        *training_data* is a list of ``(x, y)`` pairs and is shuffled in
        place at the start of every epoch.  *eta* is the learning rate.
        """
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            for start in range(0, n, mini_batch_size):
                batch = training_data[start:start + mini_batch_size]
                self.update_mini_batch(batch, eta)
            print("epochs:{}".format(j+1))

    def update_mini_batch(self, mini_batch, eta):
        """Apply one gradient-descent step using the samples in *mini_batch*."""
        grad_b = [np.zeros(b.shape) for b in self.biases]
        grad_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            sample_b, sample_w = self.backprop(x, y)
            grad_b = [total + part for total, part in zip(grad_b, sample_b)]
            grad_w = [total + part for total, part in zip(grad_w, sample_w)]
        # The summed gradient is scaled by eta / batch size.
        self.weights = [w - (eta / len(mini_batch)) * gw
                        for w, gw in zip(self.weights, grad_w)]
        self.biases = [b - (eta / len(mini_batch)) * gb
                       for b, gb in zip(self.biases, grad_b)]

    def backprop(self, x, y):
        """Return ``(nabla_b, nabla_w)``, the per-layer gradients for one sample."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # Forward pass: record every weighted input z and every activation.
        activations = [x]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activations[-1]) + b
            zs.append(z)
            activations.append(sigmoid(z))

        # Error at the output layer.
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].T)

        # Walk backwards; depth counts layers from the end (2 = second to last).
        for depth in range(2, self.num_layers):
            slope = sigmoid_prime(zs[-depth])
            delta = np.dot(self.weights[-depth + 1].T, delta) * slope
            nabla_b[-depth] = delta
            nabla_w[-depth] = np.dot(delta, activations[-depth - 1].T)
        return (nabla_b, nabla_w)

    def evaluate(self, test_data):
        """Return how many ``(x, y)`` pairs have ``argmax(output) == y``."""
        predictions = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
        correct = 0
        for predicted, expected in predictions:
            correct += int(predicted == expected)
        return correct

    def evaluate2(self, test_data):
        """Return the predicted class index (argmax of the output) per sample."""
        return [np.argmax(self.feedforward(x)) for (x, _) in test_data]

    def cost_derivative(self, output_activations, y):
        """Return the difference between the output activations and target *y*."""
        difference = output_activations - y
        return difference
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))``, applied elementwise."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator
def sigmoid_prime(z):
    """Return the derivative of the sigmoid function evaluated at *z*."""
    activation = sigmoid(z)
    return activation * (1 - activation)
# Script entry point: train a 640-50-2 network and save its parameters.
if __name__ == "__main__":
    training_data = loader.load_data_training()
    net = Network([640, 50, 2])
    # 10 epochs, mini-batch size 10, learning rate 3.0.
    net.SGD(training_data, 10, 10, 3.0)
    # Context managers make sure both files are flushed and closed.
    with open("weights.pkl", 'wb') as fw1:
        pickle.dump(net.weights, fw1)
    with open("biases.pkl", 'wb') as fw2:
        pickle.dump(net.biases, fw2)
3.测试神经网络
“sentiment_XS_test.txt”中存储着我们需要的测试数据,其存储格式和处理方式跟上边的训练数据集一样:先用deal2vector.py以同样的方式把它转换成词向量,并把结果保存为“wector.txt”。处理完成这些测试数据后,读取刚刚保存的weights和biases并应用到神经网络中。创建test.py,代码如下:
import src.integration.network as network
import pickle
import numpy as np
# 测试数据集
def dealTestData(path="wector.txt"):
    """Parse the test word-vector file into feature and label arrays.

    Each non-blank line is the ``str()`` of a Python list: an integer
    label followed by the feature values, e.g. ``[1, 0.5, ..., 0.0]``.

    Args:
        path: File to read; defaults to ``wector.txt``.

    Returns:
        A tuple ``(data, labels)`` of NumPy arrays: one row of floats per
        line, and the matching integer labels.
    """
    rows = []
    labels = []
    with open(path) as f:
        for raw in f:
            # The rows are written with str(list), so the surrounding brackets
            # must be removed before int()/float() can parse the fields.
            cleaned = raw.replace("[", "").replace("]", "").strip()
            if not cleaned:
                continue
            label_text, _, values_text = cleaned.partition(",")
            labels.append(int(label_text.strip()))
            rows.append(np.array([float(v) for v in values_text.split(",")]))
    return np.array(rows), np.array(labels)
def load_data_test():
    """Build the test set as a list of ``(x, label)`` pairs.

    ``x`` is a ``(640, 1)`` column vector of features and ``label`` is the
    integer class taken from the test file.
    """
    features, labels = dealTestData()
    columns = [np.reshape(row, (640, 1)) for row in features]
    return [pair for pair in zip(columns, labels)]
# Script entry point: load the saved parameters and report test accuracy.
if __name__ == "__main__":
    test_data = load_data_test()
    net = network.Network([640, 50, 2])
    # NOTE: pickle can execute arbitrary code while loading; only read
    # parameter files that were produced by the training script.
    with open("weights.pkl", 'rb') as fr1:
        net.weights = pickle.load(fr1)
    with open("biases.pkl", 'rb') as fr2:
        net.biases = pickle.load(fr2)
    total = len(test_data)
    right = net.evaluate(test_data)
    # Avoid a ZeroDivisionError when the test file holds no samples.
    percent = right * 100 / total if total else 0.0
    print("rate:{}/{},{}%".format(right, total, percent))
模型预测的正确率结果如下:
我们可以通过改变隐层的神经元数量,以及采用更加高级的神经网络来提高预测的正确率。
4.预测二分类
现在是真正的应用阶段了。首先,控制台可以接受一句话作为输入,然后使用jieba进行分词处理,之后这句话映射成10个64维的词向量,拉伸为640维的向量,输入神经网络做预测,最后输出结果。
import numpy as np
import pickle
import jieba
import gensim
import src.integration.network as network
def numextract(lists):
    """Flatten a sequence of NumPy arrays into one list of Python scalars.

    Elements are read with ``ndarray.item`` for positions
    ``0 .. len(array) - 1`` of each array, in order.
    """
    flat = []
    for array in lists:
        flat.extend(array.item(pos) for pos in range(len(array)))
    return flat
# 分词,生成词向量
def cin_test(vector_model, vocstr):
    """Segment a sentence with jieba and encode it as a list of 640 values.

    Each token is looked up in *vector_model*; tokens it does not contain
    become 64-value zero vectors.  The first 10 token vectors are
    flattened, and shorter sentences are zero-padded, so the result always
    has exactly 640 entries.

    Args:
        vector_model: Object indexable by word that yields a 64-value
            NumPy vector and raises ``KeyError`` for unknown words.
        vocstr: The raw sentence to encode.

    Returns:
        A list of 640 Python floats.
    """
    vectors = []
    for token in jieba.cut(vocstr):
        try:
            vectors.append(vector_model[token.strip()])
        except KeyError:
            vectors.append(np.zeros(64))

    # Keep at most 10 tokens, then pad.  A sentence of exactly 10 tokens
    # needs neither step but must still return its vector.
    flat = numextract(vectors[:10])
    flat.extend([0.0] * (640 - len(flat)))
    return flat
# 装配数据格式
_VECTOR_MODEL_CACHE = {}


def _load_vector_model(path):
    """Load the word2vec model at *path* once and reuse it on later calls."""
    if path not in _VECTOR_MODEL_CACHE:
        _VECTOR_MODEL_CACHE[path] = gensim.models.KeyedVectors.load_word2vec_format(
            path, binary=True, limit=100000)
    return _VECTOR_MODEL_CACHE[path]


def dealCinTestData(vocstr):
    """Turn one raw sentence into the input format the network expects.

    Args:
        vocstr: The sentence typed by the user.

    Returns:
        A one-element list ``[(x, label)]`` where ``x`` is a ``(640, 1)``
        column vector and ``label`` is a placeholder 0 (the true class is
        unknown at prediction time).
    """
    vector_model_path = 'news_12g_baidubaike_20g_novel_90g_embedding_64.bin'
    # Loading the embedding file is expensive; cache it across predictions.
    vector_model = _load_vector_model(vector_model_path)
    voc_list = cin_test(vector_model, vocstr)
    cin_data = np.array([np.array(voc_list)])
    cin_label = np.array([0])
    test_inputs = [np.reshape(x, (640, 1)) for x in cin_data]
    return list(zip(test_inputs, cin_label))
# 输入神经网络进行测试
def test(vocstr):
    """Classify one sentence with the saved network parameters.

    Args:
        vocstr: The raw sentence to classify.

    Returns:
        The index of the larger of the two output neurons (1 is treated
        as positive and 0 as negative by the caller).
    """
    net = network.Network([640, 50, 2])
    # NOTE: pickle can execute arbitrary code while loading; only read
    # parameter files that were produced by the training script.
    with open("weights.pkl", 'rb') as fr1:
        net.weights = pickle.load(fr1)
    with open("biases.pkl", 'rb') as fr2:
        net.biases = pickle.load(fr2)
    test_data = dealCinTestData(vocstr)
    return net.evaluate2(test_data)[0]
# Script entry point: read sentences from the console and print the prediction.
if __name__ == "__main__":
    while True:
        try:
            sentence = input("请输入:")
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session without a traceback.
            break
        if test(sentence):
            print("positive")
        else:
            print("negative")
结果如下: