#### 任务七:使用Word2Vec词向量,搭建TextCNN模型进行训练和预测
- 说明:在这个任务中,你将使用Word2Vec词向量,搭建TextCNN模型进行文本分类的训练和预测,通过卷积神经网络来进行文本分类。
- 实践步骤:
- 准备Word2Vec词向量模型和相应的训练数据集。
- 构建TextCNN模型,包括卷积层、池化层、全连接层等。
- 将Word2Vec词向量应用到模型中,作为词特征的输入。
- 使用训练数据集对TextCNN模型进行训练。
- 使用训练好的TextCNN模型对测试数据集进行预测。
from sklearn.model_selection import train_test_split
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models.word2vec import Word2Vec
from keras import *
from keras.layers import *
import numpy as np
# ---- Load data and normalise the `content` column into token lists ----
train_data = pd.read_csv('./ChatGPT生成文本检测器公开数据-更新/train.csv')
test_data = pd.read_csv('./ChatGPT生成文本检测器公开数据-更新/test.csv')

train_labels = train_data['label']


def _to_token_list(text):
    """Strip the surrounding bracket characters from a raw `content` string
    and split it into a list of non-empty tokens.

    The raw column looks like "[tok tok ...]" (the first and last characters
    are dropped); newlines are turned into standalone tokens so they can be
    filtered out together with empty strings produced by repeated spaces.
    """
    cleaned = text[1:-1].strip().replace('\n', ' \n ')
    return [tok for tok in cleaned.split(' ') if tok not in ('', '\n')]


# The former `char_count` column was computed here but never used
# (train_data is immediately reduced to the content Series), so it is gone.
train_data = train_data['content'].apply(_to_token_list)
test_data = test_data['content'].apply(_to_token_list)

# Hold out 10% of the training rows as a validation split.
x_train, x_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.1)
# Samples have varying lengths; pad/truncate each one to a fixed length so
# they can be batched.  pad_sequences (with its defaults) truncates from the
# front of over-long sequences and left-pads short ones with 0.
MAX_SEQUENCE_LENGTH = 200  # was a magic 200 repeated three times

x_train_padded_seqs = pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH)
x_test_padded_seqs = pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH)
x_test_padded_seqs_out = pad_sequences(test_data, maxlen=MAX_SEQUENCE_LENGTH)
# Load the pre-trained Word2Vec model and build the weight matrix used to
# initialise the (frozen) Keras Embedding layer.
w2v_model = Word2Vec.load('word2vec.model')
# Mapping: token string -> gensim's internal integer index.
vocab = w2v_model.wv.key_to_index
print(len(vocab))
# Row i holds the vector of the word whose gensim index is i; words without
# a pretrained vector stay as the all-zero row.  The width of 100 assumes
# the Word2Vec model was trained with vector_size=100 — TODO confirm.
embedding_matrix = np.zeros((len(vocab) + 1, 100))
# NOTE(review): the padded sequences fed to the Embedding layer contain the
# raw token IDs from the dataset, while these rows are indexed by gensim's
# key_to_index order.  The two index spaces only line up if each corpus
# token equals its gensim index — verify this mapping against the data.
# NOTE(review): index 0 is both pad_sequences' padding value and a real
# word's gensim index, so padding shares an embedding with that word.
for word, i in vocab.items():
    try:
        embedding_vector = w2v_model.wv[str(word)]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        # Every key in key_to_index has a vector, so this branch is
        # effectively unreachable; kept as a defensive guard.
        continue
#构建TextCNN模型
def TextCNN_model_2(x_train_padded_seqs,y_train,x_test_padded_seqs,y_test,embedding_matrix,x_test_padded_seqs_out):
# 模型结构:词嵌入-卷积池化*3-拼接-全连接-dropout-全连接
main_input = keras.Input(shape=(200,), dtype='float64')
# 词嵌入(使用预训练的词向量)
embedder = Embedding(len(vocab) + 1, 100, input_length=200, weights=[embedding_matrix], trainable=False)
#embedder = Embedding(len(vocab) + 1, 300, input_length=50, trainable=False)
embed = embedder(main_input)
# 词窗大小分别为3,4,5
cnn1 = Conv1D(256, 3, padding='same', strides=1, activation='relu')(embed)
cnn1 = MaxPooling1D(pool_size=38)(cnn1)
cnn2 = Conv1D(256, 4, padding='same', strides=1, activation='relu')(embed)
cnn2 = MaxPooling1D(pool_size=37)(cnn2)
cnn3 = Conv1D(256, 5, padding='same', strides=1, activation='relu')(embed)
cnn3 = MaxPooling1D(pool_size=36)(cnn3)
# 合并三个模型的输出向量
cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
flat = Flatten()(cnn)
drop = Dropout(0.2)(flat)
main_output = Dense(2, activation='softmax')(drop)
model = Model(inputs=main_input, outputs=main_output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
one_hot_labels = keras.utils.to_categorical(y_train, num_classes=2) # 将标签转换为one-hot编码
model.fit(x_train_padded_seqs, one_hot_labels, batch_size=800, epochs=5)
from keras.models import load_model
#模型的保存
model.save('model.h5')
#y_test_onehot = keras.utils.to_categorical(y_test, num_classes=3) # 将标签转换为one-hot编码
result = model.predict(x_test_padded_seqs) # 预测样本属于每个类别的概率
result_labels = np.argmax(result, axis=1) # 获得最大概率对应的标签
y_predict = list(map(int, result_labels))
# 预测测试数据集的分类结果
predictions = model.predict(x_test_padded_seqs_out)
predicted_labels = predictions.argmax(axis=1)
# 读取提交样例文件
submit = pd.read_csv('./sample_submit.csv')
submit = submit.sort_values(by='name')
# 将预测结果赋值给提交文件的label列
submit['label'] = predicted_labels
# 保存提交文件
submit.to_csv('./textcnn.csv', index=None)
TextCNN_model_2(x_train_padded_seqs,y_train,x_test_padded_seqs,y_test,embedding_matrix,x_test_padded_seqs_out)
效果一般:该 TextCNN + Word2Vec 基线模型的预测效果一般,仍有提升空间。