Natural Language Processing with a Keras CNN

Keras official site: Keras: Deep Learning for humans

Keras Chinese documentation: 主页 - Keras 中文文档

Installing Keras:

pip install keras --break-system-packages  # not recommended

Arch Linux:
sudo pacman -S python-keras

Debian:
sudo apt install python-keras

RHEL:
sudo yum install python-keras

Note: TensorFlow or Theano must be installed first as the backend; TensorFlow is used here.

First, import the libraries we will need:

from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.losses import *
from tensorflow.keras.models import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from time import time
import numpy as np
import pandas as pd
import jieba

Lines 6-7 (the Tokenizer and pad_sequences text-preprocessing imports) must be imported through tensorflow.keras; the standalone keras package does not provide this module.

On Arch Linux, if you want to use an Nvidia GPU for acceleration, you need to install the driver and CUDA first:

sudo pacman -S nvidia cuda
reboot
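After rebooting you can check whether TensorFlow actually sees the GPU (a quick sanity check, assuming TensorFlow is already installed):

import tensorflow as tf

# An empty list means TensorFlow will fall back to the CPU
print(tf.config.list_physical_devices('GPU'))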

Then load the data:

# Read the data with pandas
data = pd.read_csv('datasets/earphones.csv')

# Segment the text with jieba
cw = lambda x: list(jieba.cut(x))
data['content'] = data['content'].apply(cw)

# Join the tokens of each document with spaces
texts = [' '.join(x) for x in data['content']]

At this point we can run a quick check:

max_doc_leg = max(len(x) for x in data['content'])
print(max_doc_leg)

print(texts[2])

This gives the following output:

1651
'达音科   17 周年   倒 是 数据 最 好看 , 而且 便宜'

 

Next, preprocess the data further:

# Keep only the 30,000 most frequent words
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
sequences = pad_sequences(sequences, maxlen=1700, padding='post')
sequences = np.array(sequences)
dict_text = tokenizer.word_index

# Count positive and negative samples (sentiment_value == 1 is treated as positive)
poslen, neglen = 0, 0
leg = data['sentiment_value']
for i in range(len(data['sentiment_value'])):
    if leg[i] == 1:
        poslen += 1
    else:
        neglen += 1

# Build one-hot labels; note this assumes all sentiment_value == 1 rows
# come first in the CSV, so the labels line up with the padded sequences
positive_label = [[0, 1] for _ in range(poslen)]
negative_label = [[1, 0] for _ in range(neglen)]
y = np.concatenate([positive_label, negative_label], 0)

# Shuffle with a time-based random seed
np.random.seed(int(time()))
shuffle_indices = np.random.permutation(np.arange(len(y)))

x_shuffled = sequences[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Hold out the last 10% of the shuffled data as the test set
test_sample_index = -1 * int(0.1 * float(len(y)))
x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:]
y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:]
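Before building the network, it is worth sanity-checking the split:

# Inputs should be (num_samples, 1700); labels are one-hot vectors of length 2
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)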

Then build the convolutional network:

sequence_input = Input(shape=(1700, ))
embedding_layer = Embedding(30000, 128, input_length=1700)
embedding_sequences = embedding_layer(sequence_input)

cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_sequences)
cnn1 = MaxPooling1D(pool_size=5)(cnn1)
cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(cnn1)
cnn1 = MaxPooling1D(pool_size=5)(cnn1)
cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(cnn1)
cnn1 = MaxPooling1D(pool_size=37)(cnn1)
cnn1 = Flatten()(cnn1)

cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding_sequences)
cnn2 = MaxPooling1D(pool_size=5)(cnn2)
cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(cnn2)
cnn2 = MaxPooling1D(pool_size=5)(cnn2)
cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(cnn2)
cnn2 = MaxPooling1D(pool_size=36)(cnn2)
cnn2 = Flatten()(cnn2)

cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(embedding_sequences)
cnn3 = MaxPooling1D(pool_size=5)(cnn3)
cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(cnn3)
cnn3 = MaxPooling1D(pool_size=5)(cnn3)
cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(cnn3)
cnn3 = MaxPooling1D(pool_size=35)(cnn3)
cnn3 = Flatten()(cnn3)

# Concatenate the three branches and classify with a small dense head
merge = concatenate([cnn1, cnn2, cnn3], axis=1)
x = Dense(128, activation='relu')(merge)
x = Dropout(0.5)(x)
preds = Dense(2, activation='softmax')(x)
model = Model(sequence_input, preds)
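The final pool sizes in the three branches (37, 36, 35) are chosen so that each branch collapses to a single time step, so each Flatten yields a 32-dimensional vector and the concatenated vector has 96 features. You can confirm the shapes with:

# Prints every layer with its output shape; each branch should flatten to (None, 32)
model.summary()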

The functional API is used here; you could of course use the Sequential model instead:

model = Sequential([
    Dense(...),
    Dense(...),
    ...
])
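Note that Sequential only stacks layers linearly, so it cannot express the three parallel branches used above. As a rough single-branch equivalent (a sketch following the kernel_size=3 branch, not the author's model), the whole network could be written as:

model = Sequential([
    Embedding(30000, 128, input_length=1700),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=5),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=5),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=37),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])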

Finally, compile and train the model:

model.compile(
    optimizer=Adam(),
    loss=categorical_crossentropy, 
    metrics=['accuracy']
)
model.fit(x_train, y_train, batch_size=256, epochs=10, validation_data=(x_test, y_test))

Here Adam is used as the optimizer and the loss is categorical cross-entropy; mean squared error (mse) could also be used.
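For example, switching to mean squared error only changes the loss argument (passed as a string here; whether it trains as well for classification is another question):

model.compile(
    optimizer=Adam(),
    loss='mse',
    metrics=['accuracy']
)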

Because the dataset is large and Jupyter could not find a GPU, training had to run on the CPU, which made the machine run hot and training slow.

After roughly 10 minutes the run finished. We can now define a function to make predictions:

def I_think(content):
    # Segment the input, map each word to its id (0 for unknown words),
    # pad to the training length, and predict
    cw = list(jieba.cut(content))
    word_id = []
    for word in cw:
        try:
            temp = dict_text[word]
            word_id.append(temp)
        except KeyError:
            word_id.append(0)
    word_id = np.array(word_id)
    word_id = word_id[np.newaxis, :]
    sequences = pad_sequences(word_id, maxlen=1700, padding='post')
    #result = np.argmax(model.predict(sequences))
    result = model.predict(sequences)
    if result[0][1] < 0.3:
        print(f"欢迎下次购买~({result[0][1]})")
    else:
        print(f"不喜欢就滚蛋!({result[0][1]})")

Call the function:

I_think('音质很好,推荐购买')

Result:
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step
欢迎下次购买~(5.5004918976919726e-05)
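The commented-out np.argmax line in I_think hints at an alternative: rather than thresholding the positive-class probability at 0.3, take the index of the largest output. A minimal sketch reusing the same preprocessing (predict_label is a made-up helper name):

def predict_label(content):
    # Same segmentation and padding as I_think; unknown words map to id 0
    ids = [dict_text.get(word, 0) for word in jieba.cut(content)]
    seq = pad_sequences([ids], maxlen=1700, padding='post')
    probs = model.predict(seq)[0]
    return int(np.argmax(probs))  # 0 or 1, matching the one-hot labels above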

You can save the model with save() so it does not need to be retrained on the next run:

model.save('path/to/nlp_cnn.h5')

On the next run, simply replace the compile-and-train step with the following:

model = load_model('path/to/nlp_cnn.h5')
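One caveat: load_model restores only the network. The word index used by I_think (dict_text) is not stored in the .h5 file, so it needs to be saved separately, for example as JSON (the file path here is just an example):

import json

# After training: persist the word index next to the model
with open('path/to/word_index.json', 'w') as f:
    json.dump(tokenizer.word_index, f)

# On the next run: reload it instead of refitting the Tokenizer
with open('path/to/word_index.json') as f:
    dict_text = json.load(f)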

Finally, the complete code:

from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.losses import *
from tensorflow.keras.models import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from time import time
import numpy as np
import pandas as pd
import jieba


data = pd.read_csv('datasets/earphones.csv')
cw = lambda x: list(jieba.cut(x))
data['content'] = data['content'].apply(cw)
texts = [' '.join(x) for x in data['content']]

tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
sequences = pad_sequences(sequences, maxlen=1700, padding='post')
sequences = np.array(sequences)
dict_text = tokenizer.word_index

poslen, neglen = 0, 0
leg = data['sentiment_value']
for i in range(len(data['sentiment_value'])):
    if leg[i] == 1:
        poslen += 1
    else:
        neglen += 1

# Note: assumes all sentiment_value == 1 rows come first in the CSV
positive_label = [[0, 1] for _ in range(poslen)]
negative_label = [[1, 0] for _ in range(neglen)]
y = np.concatenate([positive_label, negative_label], 0)

np.random.seed(int(time()))
shuffle_indices = np.random.permutation(np.arange(len(y)))

x_shuffled = sequences[shuffle_indices]
y_shuffled = y[shuffle_indices]

test_sample_index = -1 * int(0.1 * float(len(y)))
x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:]
y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:]

sequence_input = Input(shape=(1700, ))
embedding_layer = Embedding(30000, 128, input_length=1700)
embedding_sequences = embedding_layer(sequence_input)

cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_sequences)
cnn1 = MaxPooling1D(pool_size=5)(cnn1)
cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(cnn1)
cnn1 = MaxPooling1D(pool_size=5)(cnn1)
cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(cnn1)
cnn1 = MaxPooling1D(pool_size=37)(cnn1)
cnn1 = Flatten()(cnn1)

cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding_sequences)
cnn2 = MaxPooling1D(pool_size=5)(cnn2)
cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(cnn2)
cnn2 = MaxPooling1D(pool_size=5)(cnn2)
cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(cnn2)
cnn2 = MaxPooling1D(pool_size=36)(cnn2)
cnn2 = Flatten()(cnn2)

cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(embedding_sequences)
cnn3 = MaxPooling1D(pool_size=5)(cnn3)
cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(cnn3)
cnn3 = MaxPooling1D(pool_size=5)(cnn3)
cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(cnn3)
cnn3 = MaxPooling1D(pool_size=35)(cnn3)
cnn3 = Flatten()(cnn3)

merge = concatenate([cnn1, cnn2, cnn3], axis=1)
x = Dense(128, activation='relu')(merge)
x = Dropout(0.5)(x)
preds = Dense(2, activation='softmax')(x)
model = Model(sequence_input, preds)

model.compile(
    optimizer=Adam(),
    loss=categorical_crossentropy, 
    metrics=['accuracy']
)
model.fit(x_train, y_train, batch_size=256, epochs=10, validation_data=(x_test, y_test))
model.save('model/nlp_cnn.h5')

def I_think(content):
    cw = list(jieba.cut(content))
    word_id = []
    for word in cw:
        try:
            temp = dict_text[word]
            word_id.append(temp)
        except KeyError:
            word_id.append(0)
    word_id = np.array(word_id)
    word_id = word_id[np.newaxis, :]
    sequences = pad_sequences(word_id, maxlen=1700, padding='post')
    #result = np.argmax(model.predict(sequences))
    result = model.predict(sequences)
    if result[0][1] < 0.3:
        print(f"欢迎下次购买~({result[0][1]})")
    else:
        print(f"不喜欢就滚蛋!({result[0][1]})")

The dataset can be found on Kaggle.
