Abstract: Developed in Keras with TensorFlow as the backend. Two corpora (ChnSentiCorp_htl_ba_2000 and IMDB) were used in quick experiments with three neural networks: a plain fully-connected network (NN), an LSTM, and a CNN. Keras code feels simpler to write than raw TensorFlow. Given enough parameters the NN fits the training data very well, but that produces overfitting; the LSTM performs considerably better than the CNN.
Keras documentation (Chinese): http://keras-cn.readthedocs.io/en/latest/
NN
Downloading the data
Running imdb.load_data() downloads the data from https://s3.amazonaws.com/text-datasets/imdb_full.pkl .
The data is a matrix of integers: each integer stands for a word, i.e. every word has been given an integer encoding.
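The integer IDs can be mapped back to words with imdb.get_word_index(). A minimal sketch of decoding a review (one assumption to note: depending on the Keras version, load_data may shift indices by its index_from offset, so this decoding is approximate):
from keras.datasets import imdb

word_index = imdb.get_word_index()                  # {'the': 1, 'and': 2, ...}
index_word = {i: w for w, i in word_index.items()}  # invert to id -> word
(x_train, y_train), _ = imdb.load_data()
# Decode the first review back to text; ids without a mapping become '?'
print(' '.join(index_word.get(i, '?') for i in x_train[0]))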
The code to explore the data and train the NN:
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
from keras.datasets import imdb
from keras.layers import Embedding, Flatten, Dense
from keras.models import Sequential
from keras.preprocessing import sequence
## EDA
# Load the data (from https://s3.amazonaws.com/text-datasets/imdb_full.pkl)
(x_train, y_train), (x_test, y_test) = imdb.load_data()
# Explore the data: distribution of review lengths
lens = list(map(len, x_train))
avg_len = np.mean(lens)
print(avg_len)
plt.hist(lens, bins=range(min(lens), max(lens) + 50, 50))
plt.show()
# Reviews differ in length, so pad/truncate them to a common length
m = max(max(list(map(len, x_train))), max(list(map(len, x_test))))
print('m=%d' % m)
maxword = min(400, m)
x_train = sequence.pad_sequences(x_train, maxlen=maxword)
x_test = sequence.pad_sequences(x_test, maxlen=maxword)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
# vocabulary size
vocab_siz = np.max([np.max(x_train[i]) for i in range(x_train.shape[0])]) + 1
print('vocab_siz=%d' % vocab_siz)
print('x_train.shape=[%d,%d]' % (x_train.shape[0], x_train.shape[1]))
# Build the model
model = Sequential()
# The first layer is an embedding layer with a vocab_siz x 64 weight matrix
model.add(Embedding(vocab_siz, 64, input_length=maxword))
# Flatten the (maxword, 64) output into a maxword * 64 = 25600-dimensional vector
model.add(Flatten())
# Add several fully-connected layers
model.add(Dense(2000, activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dense(50, activation='relu'))
# The final layer outputs a value between 0 and 1, like logistic regression
model.add(Dense(1, activation='sigmoid'))
# Compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
print(type(x_train))
# Train
model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=100, epochs=20, verbose=1)
score = model.evaluate(x_test, y_test)
print(score)
Console output:
____________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
====================================================================================================
embedding_1 (Embedding) (None, 400, 64) 5669568 embedding_input_1[0][0]
____________________________________________________________________________________________________
flatten_1 (Flatten) (None, 25600) 0 embedding_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (None, 2000) 51202000 flatten_1[0][0]
____________________________________________________________________________________________________
dense_2 (Dense) (None, 500) 1000500 dense_1[0][0]
____________________________________________________________________________________________________
dense_3 (Dense) (None, 200) 100200 dense_2[0][0]
____________________________________________________________________________________________________
dense_4 (Dense) (None, 50) 10050 dense_3[0][0]
____________________________________________________________________________________________________
dense_5 (Dense) (None, 1) 51 dense_4[0][0]
====================================================================================================
Total params: 57,982,369
Trainable params: 57,982,369
Non-trainable params: 0
Epoch 1/20
25000/25000 [==============================] - 452s - loss: 0.4248 - acc: 0.7779 - val_loss: 0.2961 - val_acc: 0.8768
Epoch 2/20
25000/25000 [==============================] - 458s - loss: 0.0779 - acc: 0.9730 - val_loss: 0.4230 - val_acc: 0.8503
Epoch 3/20
25000/25000 [==============================] - 450s - loss: 0.0050 - acc: 0.9985 - val_loss: 0.7284 - val_acc: 0.8522
Epoch 4/20
25000/25000 [==============================] - 452s - loss: 0.0031 - acc: 0.9990 - val_loss: 0.9187 - val_acc: 0.8420
Epoch 5/20
25000/25000 [==============================] - 449s - loss: 0.0052 - acc: 0.9982 - val_loss: 1.0336 - val_acc: 0.8362
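The log shows textbook overfitting: training accuracy is above 0.97 from epoch 2 on, while validation loss climbs from 0.30 to 1.03. A hedged sketch of two standard countermeasures, Dropout between the dense layers and early stopping on validation loss (not part of the run above):
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

# e.g. insert Dropout after each large Dense layer:
# model.add(Dense(2000, activation='relu'))
# model.add(Dropout(0.5))

# Stop once val_loss has not improved for 2 consecutive epochs
early = EarlyStopping(monitor='val_loss', patience=2)
model.fit(x_train, y_train, validation_data=(x_test, y_test),
          batch_size=100, epochs=20, verbose=1, callbacks=[early])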
LSTM
The corpus: a sentiment-analysis corpus found online, ChnSentiCorp_htl_ba_2000 (hotel reviews).
In the example above, the data shown were already integers (word IDs), and the step that builds those IDs was omitted. This LSTM example adds the word-to-ID step back, together with the corpus preprocessing: first merge the individual pos and neg files into one document each, segment the words, then combine the two into a single labeled document.
The segmented document is fed to Gensim to build a dictionary, giving a mapping between IDs and words; the text is then converted into a matrix of integer IDs.
Explore the sequence lengths and choose a suitable fixed length: long sentences are truncated, short ones are padded.
Once preprocessing is done, build the LSTM, train, and evaluate.
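As a tiny illustration of the Gensim dictionary step (a hypothetical two-sentence corpus, not from the data):
from gensim import corpora

docs = [['房间', '干净'], ['服务', '太差']]
dic = corpora.Dictionary(docs)
print(dic.token2id)                   # e.g. {'干净': 0, '房间': 1, '太差': 2, '服务': 3}
print(dic.doc2idx(['房间', '太差']))  # -> [1, 2]; unknown tokens map to -1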
Code:
# -*- coding: utf-8 -*-
import os
import re

import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

# word segmentation
import jieba
from bs4 import BeautifulSoup
from gensim import corpora
from keras.layers import Embedding, LSTM, Dense, Activation, Dropout
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split


def cutPhase(inFile, outFile):
    # If you have no custom dictionary, skip this line:
    # jieba.load_userdict("dict_all.txt")
    # Load the stopword list
    stoplist = {}.fromkeys([line.strip() for line in open('data/stopword.txt', 'r', encoding='utf-8')])
    f1 = open(inFile, 'r', encoding='utf-8')
    f2 = open(outFile, 'w+', encoding='utf-8')
    line = f1.readline()
    count = 0
    while line:
        b = BeautifulSoup(line, "lxml")
        line = b.text
        # Segment the line
        segs = jieba.cut(line, cut_all=False)
        # Drop empty tokens and stopwords
        segs = [word for word in segs
                if word.strip()
                and word.strip() not in stoplist
                ]
        # Join the words with spaces
        f2.write(" ".join(segs))
        f2.write('\n')
        line = f1.readline()
        count += 1
        if count % 100 == 0:
            print(count)
    f1.close()
    f2.close()


def load_data(out_pos_name='data/pos.txt', out_neg_name='data/neg.txt'):
    def do_load(file_name, dir):
        c = 0
        with open(file_name, 'w+', encoding='utf-8') as f_out:
            for root, _, files in os.walk(dir):
                # print(root)
                for f_name in files:
                    p = os.path.join(root, f_name)
                    try:
                        with open(p, mode='r', encoding='gbk') as f_read:
                            # print(os.path.join(root, f_name))
                            c += 1
                            txt = f_read.read()
                            txt = re.subn(r'\s+', ' ', txt)[0]
                            f_out.write('%s\n' % (txt))
                            # if c % 100 == 0:
                            #     print(c)
                    except Exception as e:
                        print('p:', p)
                        # print('e:', e)

    print('Loading pos ...')
    do_load(out_pos_name,
            'data/ChnSentiCorp_htl_ba_2000/pos')
    print('Loading neg ...')
    do_load(out_neg_name,
            'data/ChnSentiCorp_htl_ba_2000/neg')


def combine_data():
    c = 0
    f_w = open('data/train.cut', 'w+', encoding='utf-8')
    f_pos = open('data/pos.cut', 'r', encoding='utf-8')
    line = f_pos.readline()
    while line:
        c += 1
        f_w.write('%d\t%s' % (1, line))
        line = f_pos.readline()
    print(c)
    f_pos.close()
    f_neg = open('data/neg.cut', 'r', encoding='utf-8')
    line = f_neg.readline()
    while line:
        c += 1
        f_w.write('%d\t%s' % (0, line))
        line = f_neg.readline()
    print(c)
    f_neg.close()
    f_w.close()


if __name__ == '__main__':
    # print('# Load the data')
    # load_data(out_pos_name='data/pos.txt', out_neg_name='data/neg.txt')
    # print('# Segment the words')
    # cutPhase(inFile='data/pos.txt', outFile='data/pos.cut')
    # cutPhase(inFile='data/neg.txt', outFile='data/neg.cut')
    # Merge the data:
    # combine_data()
    Y = []
    x = []
    for line in open('data/train.cut', encoding='utf-8'):
        label, sentence = line.split("\t")
        Y.append(int(label))
        x.append(sentence.split())
    print('# Building the dictionary')
    dic = corpora.Dictionary(x)
    X = []
    for row in x:
        tmp = []
        for w_i in row:
            tmp.append(dic.token2id[w_i])
        X.append(tmp)
    Y = np.array(Y)  # X stays a list of variable-length id sequences
    # lens = list(map(len, X))
    # avg_len = np.mean(lens)
    # print(avg_len)
    # plt.hist(lens, bins=range(min(lens), max(lens) + 50, 50))
    # plt.show()
    # Lengths differ, so use a fixed length; the average length is 38.18 and the maximum is 337.
    m = max(list(map(len, X)))
    print('m=%d' % m)
    maxword = min(100, m)
    X = sequence.pad_sequences(X, maxlen=maxword)
    print(X.shape)
    ## Train/test split
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    # Build the model
    model = Sequential()
    model.add(Embedding(len(dic) + 1, 128, input_length=maxword))
    # model.add(LSTM(128, dropout=0.2, return_sequences=True))
    # model.add(LSTM(64, dropout=0.2, return_sequences=True))
    model.add(LSTM(128, dropout=0.2))
    model.add(Dense(1))
    model.add(Activation("sigmoid"))
    # Compile
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    print(model.summary())
    # Train
    model.fit(x_train, y_train, batch_size=100, epochs=10, validation_data=(x_test, y_test))
    ## Evaluate
    score, acc = model.evaluate(x_test, y_test, batch_size=100)
    print("score: %.3f, accuracy: %.3f" % (score, acc))
    # # Predict
    # my_sentences = ['讨厌 房间']  # segmented Chinese for "hate the room"
    # my_x = []
    # for s in my_sentences:
    #     words = s.split()
    #     tmp = []
    #     for w_j in words:
    #         tmp.append(dic.token2id[w_j])
    #     my_x.append(tmp)
    # my_X = sequence.pad_sequences(my_x, maxlen=maxword)
    # labels = [int(round(x[0])) for x in model.predict(my_X)]
    # for i in range(len(my_sentences)):
    #     print('%s: %s' % ('positive' if labels[i] == 1 else 'negative', my_sentences[i]))
    # Out-of-vocabulary words are not handled here; that is an improvement point for the next version.
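One possible way to close that gap (a hedged sketch, not the author's method): shift all dictionary IDs up by one so that 0 stays reserved for padding, add one extra ID for unknown words, and widen the embedding to match. Gensim's doc2idx maps unknown tokens to -1, which makes the conversion straightforward; this assumes the same dic, x, maxword and sequence as in the code above:
OFFSET = 1                        # shift ids so that 0 is reserved for padding
UNK_ID = len(dic) + OFFSET        # one id past the known vocabulary
X = [[i + OFFSET if i != -1 else UNK_ID
      for i in dic.doc2idx(row)]  # doc2idx returns -1 for unknown tokens
     for row in x]
X = sequence.pad_sequences(X, maxlen=maxword)  # pads with 0
# The embedding must now cover ids 0..UNK_ID, i.e. len(dic) + 2 rows:
model.add(Embedding(len(dic) + 2, 128, input_length=maxword))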
LSTM models (three variants were tried; their summaries and scores follow):
____________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
====================================================================================================
embedding_1 (Embedding) (None, 100, 64) 748800 embedding_input_1[0][0]
____________________________________________________________________________________________________
lstm_1 (LSTM) (None, 100, 128) 98816 embedding_1[0][0]
____________________________________________________________________________________________________
lstm_2 (LSTM) (None, 100, 64) 49408 lstm_1[0][0]
____________________________________________________________________________________________________
lstm_3 (LSTM) (None, 32) 12416 lstm_2[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (None, 1) 33 lstm_3[0][0]
____________________________________________________________________________________________________
activation_1 (Activation) (None, 1) 0 dense_1[0][0]
====================================================================================================
Total params: 909,473
Trainable params: 909,473
score: 0.572, accuracy: 0.859
____________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
====================================================================================================
embedding_1 (Embedding) (None, 100, 64) 748800 embedding_input_1[0][0]
____________________________________________________________________________________________________
lstm_1 (LSTM) (None, 128) 98816 embedding_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (None, 1) 129 lstm_1[0][0]
____________________________________________________________________________________________________
activation_1 (Activation) (None, 1) 0 dense_1[0][0]
====================================================================================================
Total params: 847,745
Trainable params: 847,745
Non-trainable params: 0
score: 0.302, accuracy: 0.871
____________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
====================================================================================================
embedding_1 (Embedding) (None, 100, 128) 1497600 embedding_input_1[0][0]
____________________________________________________________________________________________________
lstm_1 (LSTM) (None, 128) 131584 embedding_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (None, 1) 129 lstm_1[0][0]
____________________________________________________________________________________________________
activation_1 (Activation) (None, 1) 0 dense_1[0][0]
====================================================================================================
Total params: 1,629,313
Trainable params: 1,629,313
Non-trainable params: 0
score: 0.386, accuracy: 0.874
CNN
Starting from the LSTM model above, swap in a CNN; only the network definition changes.
# Build the CNN model
from keras.layers import Conv1D, MaxPooling1D, Flatten

model = Sequential()
model.add(Embedding(len(dic) + 1, 128, input_length=maxword))
model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
# Sigmoid output for binary classification (relu here would break the probability output)
model.add(Dense(1, activation='sigmoid'))
# Compile
model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
print(model.summary())
# Train
model.fit(x_train, y_train, batch_size=100, epochs=20, validation_data=(x_test, y_test))
## Evaluate
score, acc = model.evaluate(x_test, y_test, batch_size=100, verbose=1)
print("score: %.3f, accuracy: %.3f" % (score, acc))
____________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
====================================================================================================
embedding_1 (Embedding) (None, 100, 128) 1497600 embedding_input_1[0][0]
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D) (None, 100, 128) 82048 embedding_1[0][0]
____________________________________________________________________________________________________
maxpooling1d_1 (MaxPooling1D) (None, 50, 128) 0 convolution1d_1[0][0]
____________________________________________________________________________________________________
dropout_1 (Dropout) (None, 50, 128) 0 maxpooling1d_1[0][0]
____________________________________________________________________________________________________
convolution1d_2 (Convolution1D) (None, 50, 128) 49280 dropout_1[0][0]
____________________________________________________________________________________________________
maxpooling1d_2 (MaxPooling1D) (None, 25, 128) 0 convolution1d_2[0][0]
____________________________________________________________________________________________________
dropout_2 (Dropout) (None, 25, 128) 0 maxpooling1d_2[0][0]
____________________________________________________________________________________________________
flatten_1 (Flatten) (None, 3200) 0 dropout_2[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (None, 64) 204864 flatten_1[0][0]
____________________________________________________________________________________________________
dense_2 (Dense) (None, 32) 2080 dense_1[0][0]
____________________________________________________________________________________________________
dense_3 (Dense) (None, 1) 33 dense_2[0][0]
====================================================================================================
Total params: 1,835,905
Trainable params: 1,835,905
Non-trainable params: 0
____________________________________________________________________________________________________
This was mainly a quick test and the results are not great, worse than the LSTM above; worth investigating improvements, or looking into CNNText…
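For reference, a hedged sketch of a TextCNN-style model (parallel convolutions with several kernel sizes over the embedding, each globally max-pooled, then concatenated), written with the functional API and assuming the same dic and maxword as above:
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, concatenate

inp = Input(shape=(maxword,))
emb = Embedding(len(dic) + 1, 128, input_length=maxword)(inp)
# One branch per kernel size, each reduced to a fixed-size vector by global max pooling
branches = [GlobalMaxPooling1D()(
                Conv1D(filters=64, kernel_size=k, padding='same', activation='relu')(emb))
            for k in (3, 4, 5)]
merged = Dropout(0.5)(concatenate(branches))
out = Dense(1, activation='sigmoid')(merged)
model = Model(inputs=inp, outputs=out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])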
[happyprince, http://blog.csdn.net/ld326/article/details/78670821]