1. Background
This is a TensorFlow implementation of sentiment classification, built on top of the earlier PyTorch version.
2. Loading the Data
2.1 Loading Stopwords
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')  # download the stopword list
stop_words = stopwords.words('english')
print(stop_words)
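For reference, a quick look at what the list contains (the exact count varies across NLTK releases):

print(len(stop_words))  # roughly 180 entries in recent NLTK versions
print(stop_words[:5])   # ['i', 'me', 'my', 'myself', 'we']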
2.2 Text Preprocessing
1. Stemming (optional, via SnowballStemmer)
2. Removing hyperlinks and @mentions
3. Removing stopwords
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer  # stemmer

def preprocessing(text, stem=False):
    stop_words = stopwords.words('english')  # stopword list
    stemmer = SnowballStemmer('english')  # stemmer
    text_cleaning_re = r'@\S+|https?:\S+|[^A-Za-z0-9]+'  # strip @mentions, URLs, and non-alphanumeric characters
    text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))  # reduce the token to its stem
            else:
                tokens.append(token)
    return ' '.join(tokens)
if __name__ == '__main__':
    nltk.download('stopwords')  # download the stopword list
    # df is the Sentiment140 DataFrame loaded as in section 2.3
    print(df.text[2])
    # @Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
    df.text = df.text.apply(lambda x: preprocessing(x))
    print(df.text[2])
    # dived many times ball managed save 50 rest go bounds
2.3 Splitting into Training and Test Sets
Clean the text, then split it into training and test sets.
import pandas as pd
from sklearn.model_selection import train_test_split

def load_split_dataset(data_path, train_data_path, test_data_path):
    df = pd.read_csv(data_path, engine='python', header=None, encoding='utf-8')
    df.columns = ['sentiment', 'id', 'date', 'query', 'user_id', 'text']
    df = df.drop(['id', 'date', 'query', 'user_id'], axis=1)
    # clean the text
    df.text = df.text.apply(lambda x: preprocessing(x))
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=666, shuffle=True)
    # print(train_data.shape)  # (1280000, 2)
    # print(test_data.shape)  # (320000, 2)
    train_data.to_csv(train_data_path, index=False, sep='\t')
    test_data.to_csv(test_data_path, index=False, sep='\t')
2.4 Computing the Vocabulary Size
Computed over the training set only.
def get_vocab_size(tokenizer, train_data_path):
    # the split files were saved tab-separated with a header row, so read them back the same way
    train_df = pd.read_csv(train_data_path, engine='python', sep='\t')
    tokenizer.fit_on_texts(train_df.text)
    # each word maps to an integer index
    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1  # training-set vocabulary size (+1 for the padding index 0)
    print(vocab_size)  # 290684
    return vocab_size
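As a quick illustration of what fit_on_texts produces, here is a toy tokenizer on two made-up sentences (not the real dataset):

from tensorflow.keras.preprocessing.text import Tokenizer

toy = Tokenizer()
toy.fit_on_texts(['good movie', 'bad movie'])
print(toy.word_index)  # {'movie': 1, 'good': 2, 'bad': 3} -- indices start at 1; 0 is reserved for padding
print(toy.texts_to_sequences(['good movie']))  # [[2, 1]]

Indices are assigned by descending word frequency, which is why 'movie' (two occurrences) comes first.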
2.5 Preprocessing the Training and Test Sets
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_processed_train_test_data(tokenizer, train_data_path, test_data_path, max_seq_length):
    # note: the tokenizer must already be fitted on the training texts (see get_vocab_size)
    # max_seq_length = 30  # maximum sequence length
    train_df = pd.read_csv(train_data_path, engine='python', sep='\t')
    test_df = pd.read_csv(test_data_path, engine='python', sep='\t')
    # pad/truncate every text to a fixed length
    X_train = pad_sequences(tokenizer.texts_to_sequences(train_df.text), maxlen=max_seq_length)
    X_test = pad_sequences(tokenizer.texts_to_sequences(test_df.text), maxlen=max_seq_length)
    print(X_train.shape)  # (1280000, 30)
    print(X_test.shape)  # (320000, 30)
    # encode the class labels
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(train_df.sentiment.tolist())
    y_test = encoder.transform(test_df.sentiment.tolist())
    y_train = y_train.reshape(-1, 1)  # reshape into a column vector
    y_test = y_test.reshape(-1, 1)
    print(y_train.shape)  # (1280000, 1)
    print(y_test.shape)  # (320000, 1)
    return X_train, X_test, y_train, y_test
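Sentiment140 labels negative tweets as 0 and positive ones as 4; LabelEncoder maps these onto 0 and 1, which is what the sigmoid output layer in section 4 expects. A toy illustration:

from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
print(enc.fit_transform([0, 4, 4, 0]))  # [0 1 1 0] -- 0 stays 0, 4 becomes 1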
2.6 Data Preprocessing Pipeline
def main():
    DATA_PATH = '../data/training.1600000.processed.noemoticon.csv'
    TRAIN_DATA_PATH = '../data/train_data.csv'
    TEST_DATA_PATH = '../data/test_data.csv'
    MAX_SEQ_LENGTH = 30  # maximum sequence length
    # 1. Preprocess the dataset, split it into training and test sets, and save them
    nltk.download('stopwords')  # download the stopword list
    load_split_dataset(DATA_PATH, TRAIN_DATA_PATH, TEST_DATA_PATH)
    # 2. Load the training and test sets and preprocess them
    # (fit the tokenizer first; otherwise texts_to_sequences returns empty sequences)
    tokenizer = Tokenizer()
    vocab_size = get_vocab_size(tokenizer, TRAIN_DATA_PATH)
    X_train, X_test, y_train, y_test = get_processed_train_test_data(tokenizer, TRAIN_DATA_PATH, TEST_DATA_PATH, MAX_SEQ_LENGTH)
3. Word Embeddings
3.1 Building the Word-Embedding Dictionary
# Build the embedding dictionary {word: embedding vector}
import numpy as np

def get_word_embedding_dict(glove_path):
    embedding_dict = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            temp_list = line.split()  # split on whitespace
            word = temp_list[0]  # the first field is the word itself
            embeddings = np.asarray(temp_list[1:], dtype='float32')  # the remaining fields are the word's embedding
            embedding_dict[word] = embeddings
    print(len(embedding_dict))  # 400,000 words in glove.6B
    return embedding_dict
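Each line of the GloVe file is a word followed by its vector components, separated by spaces. A quick sanity check after loading ('good' is certainly in the glove.6B vocabulary):

print(embedding_dict['good'].shape)  # (300,) for glove.6B.300d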
3.2 Building the Embedding Matrix
def get_word_embedding_matrix(embedding_dict, word_index, vocab_size, embedding_dim):
    # row i must hold the vector of the word whose tokenizer index is i, so iterate
    # over the tokenizer's word_index rather than over the GloVe dictionary
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, index in word_index.items():
        embedding_vector = embedding_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    print(embedding_matrix.shape)  # (290684, 300)
    return embedding_matrix
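The matrix rows are aligned with the tokenizer's word indices, so row 0 (the padding index) and rows for words missing from GloVe stay all zeros. A minimal call, assuming the tokenizer fitted in section 2.4:

embedding_matrix = get_word_embedding_matrix(embedding_dict, tokenizer.word_index, vocab_size, 300)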
4. Building the Models
4.1 Bidirectional LSTM Model
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout, SpatialDropout1D
from tensorflow.keras import Model

def get_biLstm_model(embedding_layer, max_seq_length):
    sequence_input = Input(shape=(max_seq_length,), dtype='int32')
    embedding_sequences = embedding_layer(sequence_input)
    # SpatialDropout1D drops entire 1D feature maps rather than individual elements,
    # which promotes independence between feature maps
    x = SpatialDropout1D(0.2)(embedding_sequences)
    print(x.shape)  # (None, 30, 300)
    x = Conv1D(64, 5, activation='relu')(x)  # (None, 26, 64)
    x = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))(x)  # (None, 128)
    x = Dense(512, activation='relu')(x)  # (None, 512)
    x = Dropout(0.5)(x)
    x = Dense(512, activation='relu')(x)  # (None, 512)
    outputs = Dense(1, activation='sigmoid')(x)  # (None, 1)
    model = Model(sequence_input, outputs)
    return model
4.2 Unidirectional LSTM Model
from tensorflow.keras import Sequential

def get_lstm(embedding_layer):
    model_lstm = Sequential()
    model_lstm.add(embedding_layer)
    model_lstm.add(Dropout(0.5))
    model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model_lstm.add(Dense(1, activation='sigmoid'))
    model_lstm.summary()
    return model_lstm
5. Model Training and Prediction
5.1 Predicting Classes
def model_predict(model, x_test):
    scores = model.predict(x_test, batch_size=10000, verbose=1)
    y_pred = [1 if (score > 0.5) else 0 for score in scores]
    return y_pred
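With the 0/1 predictions in hand, the usual sklearn metrics apply directly. A brief sketch, assuming X_test and y_test from section 2.5:

from sklearn.metrics import accuracy_score, classification_report

y_pred = model_predict(model, X_test)
print(accuracy_score(y_test.ravel(), y_pred))
print(classification_report(y_test.ravel(), y_pred))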
5.2 Hyperparameter Definitions
TRAIN_DATA_PATH = '../data/train_data.csv'
TEST_DATA_PATH = '../data/test_data.csv'
GloVe_PATH = '../model/glove.6B.300d.txt'
MODEL_PATH = '../model/best_model.hdf5'
MAX_WORDS = 100000  # maximum vocabulary size (100K)
EPOCHS = 10
BATCH_SIZE = 10000
EMBEDDING_DIM = 300
LR = 0.001
MAX_SEQ_LENGTH = 30  # maximum sequence length
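Note that MAX_WORDS and MODEL_PATH are not used by the pipeline below: MAX_WORDS would be passed as Tokenizer(num_words=MAX_WORDS) to cap the vocabulary, and MODEL_PATH is meant for checkpointing. A sketch of wiring MODEL_PATH in via a ModelCheckpoint callback (not part of the original pipeline):

from tensorflow.keras.callbacks import ModelCheckpoint

# save the best weights (by validation loss) to MODEL_PATH during training
checkpoint = ModelCheckpoint(MODEL_PATH, monitor='val_loss', save_best_only=True, verbose=1)
# then pass it alongside reduceLR: callbacks=[reduceLR, checkpoint]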
5.3 Model Pipeline
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

def main():
    # 1. Load the training and test sets and preprocess them
    # (get_vocab_size fits the tokenizer, so it must run before texts are converted to sequences)
    tokenizer = Tokenizer()
    vocab_size = get_vocab_size(tokenizer, TRAIN_DATA_PATH)
    X_train, X_test, y_train, y_test = get_processed_train_test_data(tokenizer, TRAIN_DATA_PATH, TEST_DATA_PATH, MAX_SEQ_LENGTH)
    # 2. Word embeddings: represent each word as a feature vector, here using pre-trained
    #    GloVe vectors (glove.6B: trained on 6B tokens, 400K-word vocabulary)
    # 2.1 Build the embedding dictionary
    embedding_dict = get_word_embedding_dict(GloVe_PATH)
    # 2.2 Build the embedding matrix
    embedding_matrix = get_word_embedding_matrix(embedding_dict, tokenizer.word_index, vocab_size, EMBEDDING_DIM)
    # 3. Build the bidirectional LSTM model
    embedding_layer = Embedding(vocab_size,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQ_LENGTH,
                                trainable=False)
    model = get_biLstm_model(embedding_layer, MAX_SEQ_LENGTH)
    model.compile(optimizer=Adam(learning_rate=LR),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    # factor is the learning-rate reduction multiplier: lr_new = lr * factor, floored at min_lr
    # (min_lr must sit below the initial LR of 0.001, otherwise the callback never reduces anything)
    reduceLR = ReduceLROnPlateau(factor=0.1, min_lr=1e-5, monitor='val_loss', verbose=1)
    history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
                        validation_data=(X_test, y_test), callbacks=[reduceLR])
    # 4. Predict classes with the trained model
    pred = model_predict(model, X_test)
5.4 Single-Sample Prediction
def predict(model, tokenizer, input_text, max_seq_length):
    # texts_to_sequences expects a list of texts, hence the single-element list
    text_tokens = pad_sequences(tokenizer.texts_to_sequences([input_text]), maxlen=max_seq_length)
    score = model.predict(text_tokens)[0]
    return score
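A quick usage example (the sample sentence is illustrative):

score = predict(model, tokenizer, 'I love this movie', MAX_SEQ_LENGTH)
print(score)  # a value in (0, 1); above 0.5 reads as positive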