Keras官网:Keras: Deep Learning for humans
Keras中文手册:主页 - Keras 中文文档
Keras安装:
pip install keras --break-system #不推荐
Arch Linux:
sudo pacman -S python-keras
Debian:
sudo apt install python3-keras
RHEL:
sudo yum install python-keras
注:需要先下载Tensorflow或Theano作为后端,这里是用Tensorflow
首先,导入需要用到的库:
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.losses import *
from tensorflow.keras.models import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import jieba
第6-7行的文本预处理模块(Tokenizer、pad_sequences)必须从Tensorflow导入,Keras没有这个模块
使用Arch Linux如果想要使用Nvidia显卡作为加速的话,需要先下载驱动和cuda
sudo pacman -S nvidia cuda
reboot
然后导入数据:
#使用Pandas读取数据
data = pd.read_csv('datasets/earphones.csv')
#使用jieba切割数据
cw = lambda x: list(jieba.cut(x))
data['content'] = data['content'].apply(cw)
#将每个词用Space分割
texts = [' '.join(x) for x in data['content']]
这时可以测试一下:
# Length, in segmented words, of the longest review.
max_doc_leg = max(map(len, data['content']))
print(max_doc_leg)
# Show one space-joined sample.
print(texts[2])
可以得到如下结果:
1651
'达音科 17 周年 倒 是 数据 最 好看 , 而且 便宜'
然后是进一步对数据进行预处理:
# `time` is needed for the seed below and is not part of the import list
# shown at the top of the tutorial.
from time import time

# Build the vocabulary from the segmented texts, capped at 30000 word ids.
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(texts)

# Encode every review as a list of word ids, then pad at the end to a
# fixed length of 1700.
sequences = tokenizer.texts_to_sequences(texts)
sequences = pad_sequences(sequences, maxlen=1700, padding='post')
sequences = np.array(sequences)

# Full word -> id mapping learned by the tokenizer.
dict_text = tokenizer.word_index

# One-hot labels built row by row so that y[i] always belongs to
# sequences[i]: [0, 1] when sentiment_value == 1, otherwise [1, 0].
# (Grouping all positive labels first and all negative labels second only
# matches the data when the CSV happens to be sorted that way.)
y = np.array(
    [[0, 1] if value == 1 else [1, 0] for value in data['sentiment_value']]
)

# Shuffle samples and labels with the same permutation.
np.random.seed(int(time()))
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = sequences[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Hold out the last 10% of the shuffled data as the test split.
test_sample_index = -1 * int(0.1 * float(len(y)))
x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:]
y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:]
然后构建卷积网络:
# Input: one padded review of 1700 word ids, embedded into 128-d vectors.
sequence_input = Input(shape=(1700, ))
embedding_layer = Embedding(30000, 128, input_length=1700)
embedding_sequences = embedding_layer(sequence_input)


def _conv_branch(inputs, kernel_size, last_pool_size):
    """Stack three Conv1D(32, relu) + MaxPooling1D stages and flatten.

    The first two pooling stages use pool_size=5; the third uses
    ``last_pool_size``.
    """
    branch = inputs
    for pool_size in (5, 5, last_pool_size):
        branch = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(branch)
        branch = MaxPooling1D(pool_size=pool_size)(branch)
    return Flatten()(branch)


# Three parallel branches with kernel sizes 3, 4 and 5.
cnn1 = _conv_branch(embedding_sequences, 3, 37)
cnn2 = _conv_branch(embedding_sequences, 4, 36)
cnn3 = _conv_branch(embedding_sequences, 5, 35)

# Concatenate the branch outputs and classify into two classes.
merge = concatenate([cnn1, cnn2, cnn3], axis=1)
hidden = Dense(128, activation='relu')(merge)
x = Dropout(0.5)(hidden)
preds = Dense(2, activation='softmax')(x)
model = Model(sequence_input, preds)
这里是用函数式模型,当然你也可以用顺序模型
# Illustrative sketch only: Sequential takes a *list* of layers, and Dense
# requires its number of units. Note that running this rebinds `model`.
model = Sequential([
    Dense(128, activation='relu'),
    Dense(2, activation='softmax'),
    # ... add further layers here
])
最后就是编译并训练模型了:
# Configure training: Adam optimizer, categorical cross-entropy loss,
# accuracy reported as the metric.
model.compile(optimizer=Adam(), loss=categorical_crossentropy, metrics=['accuracy'])

# Train for 10 epochs in batches of 256; the held-out split is used as
# validation data.
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=10,
    batch_size=256,
)
这里使用Adam作为优化器,损失函数为交叉熵,也可以用均方误差(mse)
由于数据很多,而且Jupyter内找不到GPU,所以只能用CPU,发热严重,且训练缓慢
大概10min后,运行就完成了,我们可以定义一个函数来进行预测:
def I_think(content):
    """Predict the sentiment of one review and print a canned reply.

    Args:
        content: Raw review text (unsegmented string).

    Prints one of two replies together with ``result[0][1]``, the second
    column of the model's softmax output.
    """
    # Encode the review the same way the training texts were encoded:
    # jieba words joined by spaces, then mapped by the fitted tokenizer.
    # Going through the tokenizer (rather than looking words up in
    # dict_text directly) applies its own text filtering, drops unknown
    # words instead of inserting padding id 0 in the middle of the
    # sequence, and never emits ids >= num_words, which would fall
    # outside the Embedding(30000, ...) table.
    text = ' '.join(jieba.cut(content))
    encoded = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(encoded, maxlen=1700, padding='post')

    result = model.predict(padded)
    score = result[0][1]

    # NOTE(review): labels are encoded as [0, 1] for sentiment_value == 1,
    # yet a LOW score in column 1 selects the friendly reply here. The
    # original decision rule is kept as-is -- confirm the intended direction.
    if score < 0.3:
        print(f"欢迎下次购买~({score})")
    else:
        # Also covers score == 0.3, which previously printed nothing.
        print(f"不喜欢就滚蛋!({score})")
调用这个函数:
# Try the predictor on a sample review; it prints a reply and the score.
I_think('音质很好,推荐购买')
结果:
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step
欢迎下次购买~(5.5004918976919726e-05)
可以使用save保存模型,下次运行就不用训练了:
# Write the trained model to disk so later runs can skip training.
model.save('path/to/nlp_cnn.h5')
下次使用时,把编译和训练内容换成以下内容即可
# Load the previously saved model; use this in place of compile/fit.
model = load_model('path/to/nlp_cnn.h5')
最后是完整代码:
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.losses import *
from tensorflow.keras.models import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from time import time
import numpy as np
import pandas as pd
import jieba
# ---- Load and segment the reviews -----------------------------------------
data = pd.read_csv('datasets/earphones.csv')
cw = lambda x: list(jieba.cut(x))
data['content'] = data['content'].apply(cw)
# Join the segmented words with spaces for the tokenizer.
texts = [' '.join(x) for x in data['content']]

# ---- Encode texts as padded id sequences -----------------------------------
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
sequences = pad_sequences(sequences, maxlen=1700, padding='post')
sequences = np.array(sequences)
dict_text = tokenizer.word_index

# ---- Build labels -----------------------------------------------------------
# This step was missing from the listing, leaving `y` undefined below.
# Labels are built row by row so that y[i] belongs to sequences[i]:
# [0, 1] when sentiment_value == 1, otherwise [1, 0].
y = np.array(
    [[0, 1] if value == 1 else [1, 0] for value in data['sentiment_value']]
)

# ---- Shuffle and split (last 10% held out) ---------------------------------
np.random.seed(int(time()))
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = sequences[shuffle_indices]
y_shuffled = y[shuffle_indices]
test_sample_index = -1 * int(0.1 * float(len(y)))
x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:]
y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:]

# ---- Model: embedding followed by three parallel Conv1D branches -----------
sequence_input = Input(shape=(1700, ))
embedding_layer = Embedding(30000, 128, input_length=1700)
embedding_sequences = embedding_layer(sequence_input)

# Branch 1: kernel size 3.
cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_sequences)
cnn1 = MaxPooling1D(pool_size=5)(cnn1)
cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(cnn1)
cnn1 = MaxPooling1D(pool_size=5)(cnn1)
cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(cnn1)
cnn1 = MaxPooling1D(pool_size=37)(cnn1)
cnn1 = Flatten()(cnn1)

# Branch 2: kernel size 4.
cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding_sequences)
cnn2 = MaxPooling1D(pool_size=5)(cnn2)
cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(cnn2)
cnn2 = MaxPooling1D(pool_size=5)(cnn2)
cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(cnn2)
cnn2 = MaxPooling1D(pool_size=36)(cnn2)
cnn2 = Flatten()(cnn2)

# Branch 3: kernel size 5.
cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(embedding_sequences)
cnn3 = MaxPooling1D(pool_size=5)(cnn3)
cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(cnn3)
cnn3 = MaxPooling1D(pool_size=5)(cnn3)
cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(cnn3)
cnn3 = MaxPooling1D(pool_size=35)(cnn3)
cnn3 = Flatten()(cnn3)

# Merge the branches and classify into two classes.
merge = concatenate([cnn1, cnn2, cnn3], axis=1)
x = Dense(128, activation='relu')(merge)
x = Dropout(0.5)(x)
preds = Dense(2, activation='softmax')(x)
model = Model(sequence_input, preds)

# ---- Train and save ----------------------------------------------------------
model.compile(
    optimizer=Adam(),
    loss=categorical_crossentropy,
    metrics=['accuracy']
)
model.fit(x_train, y_train, batch_size=256, epochs=10, validation_data=(x_test, y_test))
model.save('model/nlp_cnn.h5')
def I_think(content):
    """Predict the sentiment of one review and print a canned reply.

    Args:
        content: Raw review text (unsegmented string).

    Prints one of two replies together with ``result[0][1]``, the second
    column of the model's softmax output.
    """
    # Encode the review the same way the training texts were encoded:
    # jieba words joined by spaces, then mapped by the fitted tokenizer.
    # Going through the tokenizer (rather than looking words up in
    # dict_text directly) applies its own text filtering, drops unknown
    # words instead of inserting padding id 0 in the middle of the
    # sequence, and never emits ids >= num_words, which would fall
    # outside the Embedding(30000, ...) table.
    text = ' '.join(jieba.cut(content))
    encoded = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(encoded, maxlen=1700, padding='post')

    result = model.predict(padded)
    score = result[0][1]

    # NOTE(review): labels are encoded as [0, 1] for sentiment_value == 1,
    # yet a LOW score in column 1 selects the friendly reply here. The
    # original decision rule is kept as-is -- confirm the intended direction.
    if score < 0.3:
        print(f"欢迎下次购买~({score})")
    else:
        # Also covers score == 0.3, which previously printed nothing.
        print(f"不喜欢就滚蛋!({score})")
数据集可以在Kaggle上找到