TensorFlow Sentiment Classification

1. Background

This is a TensorFlow implementation of sentiment classification, built on top of an earlier PyTorch implementation.

2. Loading the Data

2.1 Loading Stop Words

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')  # download the stop-word list
stop_words = stopwords.words('english')
print(stop_words)

2.2 Text Preprocessing

1. Stemming / lemmatization

2. Removing hyperlinks and @mentions

3. Removing stop words

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer  # stemming

def preprocessing(text, stem=False):
    stop_words = stopwords.words('english')  # stop words
    stemmer = SnowballStemmer('english')     # stemmer
    text_cleaning_re = r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+'  # links, @mentions, non-alphanumerics

    text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))  # keep only the stem
            else:
                tokens.append(token)
    return ' '.join(tokens)

if __name__ == '__main__':
    nltk.download('stopwords')  # download the stop-word list
    # df is the raw tweet DataFrame loaded in section 2.3
    print(df.text[2])
    # @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
    df.text = df.text.apply(lambda x: preprocessing(x))
    print(df.text[2])
    # dived many times ball managed save 50 rest go bounds

2.3 Splitting into Training and Test Sets

Clean the text, then split it into training and test sets.

import pandas as pd
from sklearn.model_selection import train_test_split

def load_split_dataset(data_path, train_data_path, test_data_path):
    df = pd.read_csv(data_path, engine='python', header=None, encoding='utf-8')
    df.columns = ['sentiment', 'id', 'date', 'query', 'user_id', 'text']
    df = df.drop(['id', 'date', 'query', 'user_id'], axis=1)
    # clean the text
    df.text = df.text.apply(lambda x: preprocessing(x))
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=666, shuffle=True)
    # print(train_data.shape)  # (1280000, 2)
    # print(test_data.shape)   # (320000, 2)
    train_data.to_csv(train_data_path, index=False, sep='\t')
    test_data.to_csv(test_data_path, index=False, sep='\t')

2.4 Computing the Vocabulary Size

Computed over the training set.

def get_vocab_size(tokenizer, train_data_path):
    # the file was saved with sep='\t' and a header row in load_split_dataset
    train_df = pd.read_csv(train_data_path, engine='python', sep='\t')
    tokenizer.fit_on_texts(train_df.text)
    # each word maps to one index
    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1  # training-set vocabulary size (+1 for the padding index 0)
    print(vocab_size)  # 290684
    return vocab_size
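
The +1 accounts for index 0, which the Keras Tokenizer never assigns to a word and which pad_sequences uses for padding. A minimal sketch with a toy corpus (not the real dataset) to illustrate this:

from tensorflow.keras.preprocessing.text import Tokenizer

toy_tokenizer = Tokenizer()
toy_tokenizer.fit_on_texts(['good day', 'bad day'])
print(toy_tokenizer.word_index)           # e.g. {'day': 1, 'good': 2, 'bad': 3} -- indices start at 1
print(len(toy_tokenizer.word_index) + 1)  # 4: one extra row so index 0 stays free for padding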

2.5 Preprocessing the Training and Test Sets

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_processed_train_test_data(tokenizer, train_data_path, test_data_path, max_seq_length):
    # note: the tokenizer must already be fitted on the training texts (see get_vocab_size)
    # max_seq_length = 30  # maximum sequence length
    train_df = pd.read_csv(train_data_path, engine='python', sep='\t')
    test_df = pd.read_csv(test_data_path, engine='python', sep='\t')

    # pad/truncate every text to the same fixed length
    X_train = pad_sequences(tokenizer.texts_to_sequences(train_df.text), maxlen=max_seq_length)
    X_test = pad_sequences(tokenizer.texts_to_sequences(test_df.text), maxlen=max_seq_length)
    print(X_train.shape)  # (1280000, 30)
    print(X_test.shape)   # (320000, 30)

    # encode the class labels
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(train_df.sentiment.tolist())
    y_test = encoder.transform(test_df.sentiment.tolist())
    y_train = y_train.reshape(-1, 1)  # reshape to a column vector
    y_test = y_test.reshape(-1, 1)
    print(y_train.shape)   # (1280000, 1)
    print(y_test.shape)    # (320000, 1)

    return X_train, X_test, y_train, y_test
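
To make the fixed-length step concrete, a small sketch (toy sentences, hypothetical indices) of what texts_to_sequences followed by pad_sequences produces:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

toy = Tokenizer()
toy.fit_on_texts(['dived many times ball managed save rest'])
seqs = toy.texts_to_sequences(['dived many times ball', 'managed save'])
print(seqs)                           # e.g. [[1, 2, 3, 4], [5, 6]]
print(pad_sequences(seqs, maxlen=6))  # each row left-padded with zeros to length 6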

2.6 Data Preprocessing Pipeline

def main():
    DATA_PATH = '../data/training.1600000.processed.noemoticon.csv'
    TRAIN_DATA_PATH = '../data/train_data.csv'
    TEST_DATA_PATH = '../data/test_data.csv'
    MAX_SEQ_LENGTH = 30  # maximum sequence length

    # 1. Preprocess the dataset, split it into training and test sets, and save them
    nltk.download('stopwords')  # download the stop-word list
    load_split_dataset(DATA_PATH, TRAIN_DATA_PATH, TEST_DATA_PATH)

    # 2. Load the training and test sets and preprocess them
    # (fit the tokenizer first so that texts_to_sequences sees a populated vocabulary)
    tokenizer = Tokenizer()
    vocab_size = get_vocab_size(tokenizer, TRAIN_DATA_PATH)
    X_train, X_test, y_train, y_test = get_processed_train_test_data(tokenizer, TRAIN_DATA_PATH, TEST_DATA_PATH, MAX_SEQ_LENGTH)

3. Word Embeddings

3.1 Building the Embedding Dictionary

import numpy as np

# build the embedding dictionary {word: embedding vector}
def get_word_embedding_dict(glove_path):
    embedding_dict = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            temp_list = line.split()  # split on whitespace
            word = temp_list[0]       # the first token is the word
            embeddings = np.asarray(temp_list[1:], dtype='float32')  # the word's embedding vector
            embedding_dict[word] = embeddings
    print(len(embedding_dict))  # 400,000 words
    return embedding_dict
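
A quick sanity check after loading the dictionary (assuming the 300-dimensional glove.6B file at the path from section 5.2):

embedding_dict = get_word_embedding_dict('../model/glove.6B.300d.txt')
print(len(embedding_dict))           # roughly 400,000 entries
print(embedding_dict['good'].shape)  # (300,) -- one 300-dimensional vector per word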

3.2 Building the Embedding Matrix

def get_word_embedding_matrix(embedding_dict, word_index, vocab_size, embedding_dim):
    # row i of the matrix holds the GloVe vector of the word whose tokenizer index is i;
    # words without a pre-trained vector keep the all-zero row
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, index in word_index.items():
        embedding_vector = embedding_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    print(embedding_matrix.shape)  # (290684, 300)
    return embedding_matrix
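
A hedged sketch (hypothetical helper, not part of the original code) for checking how much of the training vocabulary is actually covered by the pre-trained vectors; words missing from GloVe keep all-zero rows in the matrix:

def embedding_coverage(word_index, embedding_dict):
    # fraction of tokenizer words that have a pre-trained GloVe vector
    hits = sum(1 for word in word_index if word in embedding_dict)
    return hits / len(word_index)

# e.g. print(embedding_coverage(tokenizer.word_index, embedding_dict))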

4. Building the Models

4.1 Bidirectional LSTM Model

from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout, SpatialDropout1D
from tensorflow.keras import Model

def get_biLstm_model(embedding_layer, max_seq_length):
    sequence_input = Input(shape=(max_seq_length,), dtype='int32')
    embedding_sequences = embedding_layer(sequence_input)

    # SpatialDropout1D drops entire 1D feature maps rather than individual elements,
    # which promotes independence between feature maps.
    x = SpatialDropout1D(0.2)(embedding_sequences)
    print(x.shape)  # (None, 30, 300)
    x = Conv1D(64, 5, activation='relu')(x)  # (None, 26, 64)
    x = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))(x)  # (None, 128)
    x = Dense(512, activation='relu')(x)  # (None, 512)
    x = Dropout(0.5)(x)
    x = Dense(512, activation='relu')(x)  # (None, 512)
    outputs = Dense(1, activation='sigmoid')(x)  # (None, 1)
    model = Model(sequence_input, outputs)
    return model
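
To check the layer shapes annotated in the comments above, the model can be built with a small dummy embedding layer and inspected via summary() (a sketch with hypothetical sizes, not the real vocabulary):

from tensorflow.keras.layers import Embedding

dummy_embedding = Embedding(10000, 300, input_length=30, trainable=False)
model = get_biLstm_model(dummy_embedding, 30)
model.summary()  # Input (None, 30) -> Embedding (None, 30, 300) -> Conv1D (None, 26, 64) -> BiLSTM (None, 128) -> Dense layers -> (None, 1)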

4.2 Unidirectional LSTM Model

from tensorflow.keras import Sequential

def get_lstm(embedding_layer):
    model_lstm = Sequential()
    model_lstm.add(embedding_layer)
    model_lstm.add(Dropout(0.5))
    model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model_lstm.add(Dense(1, activation='sigmoid'))
    model_lstm.summary()
    return model_lstm

5. Model Training and Prediction

5.1 Predicting Classes

def model_predict(model, x_test):
    scores = model.predict(x_test, batch_size=10000, verbose=1)
    y_pred = [1 if (score > 0.5) else 0 for score in scores]
    return y_pred

5.2 Hyperparameter Definitions

TRAIN_DATA_PATH = '../data/train_data.csv'
TEST_DATA_PATH = '../data/test_data.csv'
GloVe_PATH = '../model/glove.6B.300d.txt'
MODEL_PATH = '../model/best_model.hdf5'
MAX_WORDS = 100000  # maximum vocabulary size: 100,000
EPOCHS = 10
BATCH_SIZE = 10000
EMBEDDING_DIM = 300
LR = 0.001
MAX_SEQ_LENGTH = 30  # maximum sequence length: 30

5.3 Model Pipeline

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

def main():
    # 1. Load the training and test sets and preprocess them
    # (fit the tokenizer first so that texts_to_sequences sees a populated vocabulary)
    tokenizer = Tokenizer()
    vocab_size = get_vocab_size(tokenizer, TRAIN_DATA_PATH)
    X_train, X_test, y_train, y_test = get_processed_train_test_data(tokenizer, TRAIN_DATA_PATH, TEST_DATA_PATH, MAX_SEQ_LENGTH)

    # 2. Word embeddings: represent each word as a feature vector, using pre-trained GloVe vectors (glove.6B, about 400,000 words)
    # 2.1 Build the embedding dictionary
    embedding_dict = get_word_embedding_dict(GloVe_PATH)
    # 2.2 Build the embedding matrix
    embedding_matrix = get_word_embedding_matrix(embedding_dict, tokenizer.word_index, vocab_size, EMBEDDING_DIM)

    # 3. Build the bidirectional LSTM model
    embedding_layer = Embedding(vocab_size,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQ_LENGTH,
                                trainable=False)
    model = get_biLstm_model(embedding_layer, MAX_SEQ_LENGTH)
    model.compile(optimizer=Adam(learning_rate=LR),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    # factor is the learning-rate reduction factor: lr_new = lr * factor, bounded below by min_lr
    reduceLR = ReduceLROnPlateau(factor=0.1, min_lr=0.01, monitor='val_loss', verbose=1)
    history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
                        validation_data=(X_test, y_test), callbacks=[reduceLR])

    # 4. Predict classes on the test set
    pred = model_predict(model, X_test)
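
ModelCheckpoint is imported and MODEL_PATH is defined above but never wired into fit(); a hedged sketch of how a best-model checkpoint could be added alongside the learning-rate callback:

checkpoint = ModelCheckpoint(MODEL_PATH, monitor='val_loss', save_best_only=True, verbose=1)
history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
                    validation_data=(X_test, y_test), callbacks=[reduceLR, checkpoint])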

5.4 Single-Sample Prediction

def predict(model, tokenizer, input_text, max_seq_length):
    text_tokens = pad_sequences(tokenizer.texts_to_sequences([input_text]), maxlen=max_seq_length)
    score = model.predict(text_tokens)[0]
    return score
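
Example usage (assumes the trained model and the fitted tokenizer from the pipeline in section 5.3):

score = predict(model, tokenizer, 'I love this movie, it was great!', MAX_SEQ_LENGTH)
label = 'POSITIVE' if score[0] > 0.5 else 'NEGATIVE'
print(label, score[0])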
