Natural Language Processing with a Keras CNN

Keras official site: Keras: Deep Learning for humans

Keras Chinese documentation: 主页 - Keras 中文文档

Installing Keras:

pip install keras --break-system-packages  # not recommended

Arch Linux:
sudo pacman -S python-keras

Debian:
sudo apt install python-keras

RHEL:
sudo yum install python-keras

Note: TensorFlow or Theano must be installed first as the backend; TensorFlow is used here.

First, import the libraries we will need:

from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.losses import *
from tensorflow.keras.models import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from time import time
import numpy as np
import pandas as pd
import jieba

Lines 6-7 (the Tokenizer and pad_sequences text-preprocessing imports) must be imported through tensorflow.keras; the standalone keras package does not provide this module.

On Arch Linux, if you want to use an Nvidia GPU for acceleration, you need to install the driver and CUDA first:

sudo pacman -S nvidia cuda
reboot
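After rebooting you can check whether TensorFlow actually sees the GPU (a quick sanity check, assuming TensorFlow is already installed):

import tensorflow as tf

# An empty list means TensorFlow will fall back to the CPU
print(tf.config.list_physical_devices('GPU'))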

Then load the data:

# Read the data with pandas
data = pd.read_csv('datasets/earphones.csv')

# Segment the text with jieba
cw = lambda x: list(jieba.cut(x))
data['content'] = data['content'].apply(cw)

# Join the tokens of each document with spaces
texts = [' '.join(x) for x in data['content']]

At this point we can run a quick check:

max_doc_leg = max(len(x) for x in data['content'])
print(max_doc_leg)

print(texts[2])

This gives the following output:

1651
'达音科   17 周年   倒 是 数据 最 好看 , 而且 便宜'

 

Next, preprocess the data further:

# Keep only the 30,000 most frequent words
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
sequences = pad_sequences(sequences, maxlen=1700, padding='post')
sequences = np.array(sequences)
dict_text = tokenizer.word_index

# Count positive and negative samples (sentiment_value == 1 is treated as positive)
poslen, neglen = 0, 0
leg = data['sentiment_value']
for i in range(len(data['sentiment_value'])):
    if leg[i] == 1:
        poslen += 1
    else:
        neglen += 1

# Build one-hot labels; note this assumes all sentiment_value == 1 rows
# come first in the CSV, so the labels line up with the padded sequences
positive_label = [[0, 1] for _ in range(poslen)]
negative_label = [[1, 0] for _ in range(neglen)]
y = np.concatenate([positive_label, negative_label], 0)

# Shuffle with a time-based random seed
np.random.seed(int(time()))
shuffle_indices = np.random.permutation(np.arange(len(y)))

x_shuffled = sequences[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Hold out the last 10% of the shuffled data as the test set
test_sample_index = -1 * int(0.1 * float(len(y)))
x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:]
y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:]
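Before building the network, it is worth sanity-checking the split:

# Inputs should be (num_samples, 1700); labels are one-hot vectors of length 2
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)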

Then build the convolutional network:

sequence_input = Input(shape=(1700, ))
embedding_layer = Embedding(30000, 128, input_length=1700)
embedding_sequences = embedding_layer(sequence_input)

cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_sequences)
cnn1 = MaxPooling1D(pool_size=5)(cnn1)
cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(cnn1)
cnn1 = MaxPooling1D(pool_size=5)(cnn1)
cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(cnn1)
cnn1 = MaxPooling1D(pool_size=37)(cnn1)
cnn1 = Flatten()(cnn1)

cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding_sequences)
cnn2 = MaxPooling1D(pool_size=5)(cnn2)
cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(cnn2)
cnn2 = MaxPooling1D(pool_size=5)(cnn2)
cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(cnn2)
cnn2 = MaxPooling1D(pool_size=36)(cnn2)
cnn2 = Flatten()(cnn2)

cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(embedding_sequences)
cnn3 = MaxPooling1D(pool_size=5)(cnn3)
cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(cnn3)
cnn3 = MaxPooling1D(pool_size=5)(cnn3)
cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(cnn3)
cnn3 = MaxPooling1D(pool_size=35)(cnn3)
cnn3 = Flatten()(cnn3)

# Concatenate the three branches and classify with a small dense head
merge = concatenate([cnn1, cnn2, cnn3], axis=1)
x = Dense(128, activation='relu')(merge)
x = Dropout(0.5)(x)
preds = Dense(2, activation='softmax')(x)
model = Model(sequence_input, preds)
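The final pool sizes in the three branches (37, 36, 35) are chosen so that each branch collapses to a single time step, so each Flatten yields a 32-dimensional vector and the concatenated vector has 96 features. You can confirm the shapes with:

# Prints every layer with its output shape; each branch should flatten to (None, 32)
model.summary()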

The functional API is used here; you could of course use the Sequential model instead:

model = Sequential([
    Dense(...),
    Dense(...),
    ...
])
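Note that Sequential only stacks layers linearly, so it cannot express the three parallel branches used above. As a rough single-branch equivalent (a sketch following the kernel_size=3 branch, not the author's model), the whole network could be written as:

model = Sequential([
    Embedding(30000, 128, input_length=1700),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=5),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=5),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=37),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])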

Finally, compile and train the model:

model.compile(
    optimizer=Adam(),
    loss=categorical_crossentropy, 
    metrics=['accuracy']
)
model.fit(x_train, y_train, batch_size=256, epochs=10, validation_data=(x_test, y_test))

Here Adam is used as the optimizer and the loss is categorical cross-entropy; mean squared error (mse) could also be used.
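For example, switching to mean squared error only changes the loss argument (passed as a string here; whether it trains as well for classification is another question):

model.compile(
    optimizer=Adam(),
    loss='mse',
    metrics=['accuracy']
)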

Because the dataset is large and Jupyter could not find a GPU, training had to run on the CPU, which made the machine run hot and training slow.

After roughly 10 minutes the run finished. We can now define a function to make predictions:

def I_think(content):
    # Segment the input, map each word to its id (0 for unknown words),
    # pad to the training length, and predict
    cw = list(jieba.cut(content))
    word_id = []
    for word in cw:
        try:
            temp = dict_text[word]
            word_id.append(temp)
        except KeyError:
            word_id.append(0)
    word_id = np.array(word_id)
    word_id = word_id[np.newaxis, :]
    sequences = pad_sequences(word_id, maxlen=1700, padding='post')
    #result = np.argmax(model.predict(sequences))
    result = model.predict(sequences)
    if result[0][1] < 0.3:
        print(f"欢迎下次购买~({result[0][1]})")
    else:
        print(f"不喜欢就滚蛋!({result[0][1]})")

Call the function:

I_think('音质很好,推荐购买')

Result:
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step
欢迎下次购买~(5.5004918976919726e-05)
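The commented-out np.argmax line in I_think hints at an alternative: rather than thresholding the positive-class probability at 0.3, take the index of the largest output. A minimal sketch reusing the same preprocessing (predict_label is a made-up helper name):

def predict_label(content):
    # Same segmentation and padding as I_think; unknown words map to id 0
    ids = [dict_text.get(word, 0) for word in jieba.cut(content)]
    seq = pad_sequences([ids], maxlen=1700, padding='post')
    probs = model.predict(seq)[0]
    return int(np.argmax(probs))  # 0 or 1, matching the one-hot labels above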

You can save the model with save() so it does not need to be retrained on the next run:

model.save('path/to/nlp_cnn.h5')

On the next run, simply replace the compile-and-train step with the following:

model = load_model('path/to/nlp_cnn.h5')
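One caveat: load_model restores only the network. The word index used by I_think (dict_text) is not stored in the .h5 file, so it needs to be saved separately, for example as JSON (the file path here is just an example):

import json

# After training: persist the word index next to the model
with open('path/to/word_index.json', 'w') as f:
    json.dump(tokenizer.word_index, f)

# On the next run: reload it instead of refitting the Tokenizer
with open('path/to/word_index.json') as f:
    dict_text = json.load(f)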

Finally, the complete code:

from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.losses import *
from tensorflow.keras.models import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from time import time
import numpy as np
import pandas as pd
import jieba


data = pd.read_csv('datasets/earphones.csv')
cw = lambda x: list(jieba.cut(x))
data['content'] = data['content'].apply(cw)
texts = [' '.join(x) for x in data['content']]

tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
sequences = pad_sequences(sequences, maxlen=1700, padding='post')
sequences = np.array(sequences)
dict_text = tokenizer.word_index

poslen, neglen = 0, 0
leg = data['sentiment_value']
for i in range(len(data['sentiment_value'])):
    if leg[i] == 1:
        poslen += 1
    else:
        neglen += 1

# Note: assumes all sentiment_value == 1 rows come first in the CSV
positive_label = [[0, 1] for _ in range(poslen)]
negative_label = [[1, 0] for _ in range(neglen)]
y = np.concatenate([positive_label, negative_label], 0)

np.random.seed(int(time()))
shuffle_indices = np.random.permutation(np.arange(len(y)))

x_shuffled = sequences[shuffle_indices]
y_shuffled = y[shuffle_indices]

test_sample_index = -1 * int(0.1 * float(len(y)))
x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:]
y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:]

sequence_input = Input(shape=(1700, ))
embedding_layer = Embedding(30000, 128, input_length=1700)
embedding_sequences = embedding_layer(sequence_input)

cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_sequences)
cnn1 = MaxPooling1D(pool_size=5)(cnn1)
cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(cnn1)
cnn1 = MaxPooling1D(pool_size=5)(cnn1)
cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(cnn1)
cnn1 = MaxPooling1D(pool_size=37)(cnn1)
cnn1 = Flatten()(cnn1)

cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding_sequences)
cnn2 = MaxPooling1D(pool_size=5)(cnn2)
cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(cnn2)
cnn2 = MaxPooling1D(pool_size=5)(cnn2)
cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(cnn2)
cnn2 = MaxPooling1D(pool_size=36)(cnn2)
cnn2 = Flatten()(cnn2)

cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(embedding_sequences)
cnn3 = MaxPooling1D(pool_size=5)(cnn3)
cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(cnn3)
cnn3 = MaxPooling1D(pool_size=5)(cnn3)
cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(cnn3)
cnn3 = MaxPooling1D(pool_size=35)(cnn3)
cnn3 = Flatten()(cnn3)

merge = concatenate([cnn1, cnn2, cnn3], axis=1)
x = Dense(128, activation='relu')(merge)
x = Dropout(0.5)(x)
preds = Dense(2, activation='softmax')(x)
model = Model(sequence_input, preds)

model.compile(
    optimizer=Adam(),
    loss=categorical_crossentropy, 
    metrics=['accuracy']
)
model.fit(x_train, y_train, batch_size=256, epochs=10, validation_data=(x_test, y_test))
model.save('model/nlp_cnn.h5')

def I_think(content):
    cw = list(jieba.cut(content))
    word_id = []
    for word in cw:
        try:
            temp = dict_text[word]
            word_id.append(temp)
        except KeyError:
            word_id.append(0)
    word_id = np.array(word_id)
    word_id = word_id[np.newaxis, :]
    sequences = pad_sequences(word_id, maxlen=1700, padding='post')
    #result = np.argmax(model.predict(sequences))
    result = model.predict(sequences)
    if result[0][1] < 0.3:
        print(f"欢迎下次购买~({result[0][1]})")
    else:
        print(f"不喜欢就滚蛋!({result[0][1]})")

The dataset can be found on Kaggle.
