[NLP] Text Preprocessing Before Using Pretrained Word Vectors

Notes

Environment: Kaggle kernel.

The data comes from Kaggle and has to be added to the kernel manually.

import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from tqdm import tqdm
import operator 
import os
import re
import gc
tqdm.pandas()
Using TensorFlow backend.
train = pd.read_csv("../input/imdb-dataset/labeledTrainData.tsv", header=0,delimiter="\t", quoting=3)

Parameters

max_features = 10000  # size of the vocabulary kept by the tokenizer
maxlen = 200  # maximum sequence length after padding / truncation
embed_size = 300  # dimensionality of the pretrained word vectors

1. Using the pretrained word vectors without any preprocessing

Preparing the data

# Build the tokenizer
tokenizer = Tokenizer(num_words=max_features,lower=True)
tokenizer.fit_on_texts(list(train['review']))
#word_index = tokenizer.word_index
x_train = tokenizer.texts_to_sequences(list(train['review']))
x_train = pad_sequences(x_train,maxlen=maxlen) # padding
y_train = list(train['sentiment'])
# Split into training and validation sets
x_train,x_val,y_train,y_val = train_test_split(x_train,y_train,test_size=0.3,random_state=0)

Loading the embeddings

def load_word2vec(filename):
    word2vec = KeyedVectors.load_word2vec_format(filename, binary=True)
    embeddings_index = {}
    # Copy each (word, vector) pair into a plain dict for fast lookups;
    # KeyedVectors exposes .vectors / .index2word directly, so the deprecated .wv is not needed
    for i, vec in tqdm(enumerate(word2vec.vectors)):
        embeddings_index[word2vec.index2word[i]] = vec
    return embeddings_index
EMBEDDING_FILE = '../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin.gz'
embeddings_index = load_word2vec(EMBEDDING_FILE)
3000000it [00:13, 226113.62it/s]

Building the embedding matrix

def build_matrix(embeddings_index,word_index):
    embedding_matrix = np.zeros((max_features, 300))
    for word, i in tqdm(word_index.items()):
        if i >= max_features: continue
        try:
            # Vector for this word
            embedding_vector = embeddings_index[word]
        except KeyError:
            # If the word has no pretrained vector, fall back to the vector for "unknown"
            embedding_vector = embeddings_index["unknown"]
        if embedding_vector is not None:
            # Keep row i of embedding_matrix aligned with index i in word_index
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
embedding_matrix = build_matrix(embeddings_index, tokenizer.word_index)
100%|██████████| 88582/88582 [00:00<00:00, 963944.40it/s]

Building the model

def build_model(embedding_matrix=None):
    inp = Input(shape=(maxlen,))
    if embedding_matrix is None:
        x = Embedding(max_features, embed_size)(inp)
    else:
        x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(16, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
model = build_model(embedding_matrix)
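
One detail worth noting: as written, the pretrained weights are still fine-tuned during training, because Keras layers default to trainable=True. If you wanted to keep the GoogleNews vectors fixed, a minimal variant (not used in this notebook) would be:

# Hypothetical variant: freeze the pretrained vectors so they are not updated during training
x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)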

Training

history = model.fit(x_train, y_train, batch_size=512, epochs=10, validation_data=(x_val, y_val))
Train on 17500 samples, validate on 7500 samples
Epoch 1/10
17500/17500 [==============================] - 2s 130us/step - loss: 0.0260 - acc: 0.9941 - val_loss: 0.4803 - val_acc: 0.8791
Epoch 2/10
17500/17500 [==============================] - 2s 131us/step - loss: 0.0181 - acc: 0.9970 - val_loss: 0.5183 - val_acc: 0.8787
Epoch 3/10
17500/17500 [==============================] - 2s 131us/step - loss: 0.0148 - acc: 0.9978 - val_loss: 0.5404 - val_acc: 0.8739
Epoch 4/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.0132 - acc: 0.9979 - val_loss: 0.8335 - val_acc: 0.8148
Epoch 5/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.0949 - acc: 0.9657 - val_loss: 0.4773 - val_acc: 0.8753
Epoch 6/10
17500/17500 [==============================] - 2s 131us/step - loss: 0.0189 - acc: 0.9958 - val_loss: 0.5145 - val_acc: 0.8796
Epoch 7/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.0133 - acc: 0.9975 - val_loss: 0.5362 - val_acc: 0.8797
Epoch 8/10
17500/17500 [==============================] - 2s 131us/step - loss: 0.0082 - acc: 0.9992 - val_loss: 0.5615 - val_acc: 0.8789
Epoch 9/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.0057 - acc: 0.9995 - val_loss: 0.5888 - val_acc: 0.8763
Epoch 10/10
17500/17500 [==============================] - 2s 131us/step - loss: 0.0050 - acc: 0.9997 - val_loss: 0.6223 - val_acc: 0.8771

2. Text preprocessing before using pretrained word vectors

Main principles:

  1. Do not apply the standard NLP text-preprocessing steps, such as removing stop words; they throw away useful information.

  2. Make your vocabulary as close as possible to that of the embedding you are using (see the short check after this list).
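
To make this concrete, here is a small illustrative check (not part of the original notebook; it assumes nltk is installed): a "standard" step such as stemming tends to produce tokens that no longer match the keys of the pretrained embedding, while the raw surface forms usually do.

# Illustrative only: stemmed tokens frequently fall out of the pretrained vocabulary
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
for w in ['movies', 'acting', 'terrible']:
    s = stemmer.stem(w)
    print(w, w in embeddings_index, '->', s, s in embeddings_index)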

# Build a word-frequency dictionary over the corpus
def build_vocab(sentences):
    # key is the word, value is its frequency
    vocab = {}
    for sentence in tqdm(sentences):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab
def check_coverage(vocab, embeddings_index):
    '''
    Report what fraction of the vocabulary and of the raw text is covered by the embedding.
    '''
    iv = {}   # words that are in the embedding's vocabulary
    oov = {}  # words that are out of the embedding's vocabulary
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            # The word has a pretrained vector
            iv[word] = embeddings_index[word]
            k += vocab[word]
        except KeyError:
            oov[word] = vocab[word]
            i += vocab[word]

    print('Found embeddings for {:.2%} of vocab'.format(len(iv) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x
sentences = train['review'].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
100%|██████████| 25000/25000 [00:00<00:00, 37408.86it/s]
100%|██████████| 25000/25000 [00:01<00:00, 13609.17it/s]
100%|██████████| 289705/289705 [00:00<00:00, 549188.49it/s]


Found embeddings for 23.71% of vocab
Found embeddings for  74.14% of all text

Only 23.71% of the words in the current vocabulary have a corresponding vector in the embedding, which is clearly not a good ratio, so let's try to improve it. First, look at the top 10 OOV entries.

oov[:10]
[('a', 155088),
 ('and', 152651),
 ('of', 142970),
 ('to', 132564),
 ('/><br', 50935),
 ('-', 9341),
 ('/>The', 7243),
 ('<br', 5749),
 ('movie.', 4800),
 ('"I', 4455)]

The top 4 entries are "a", "and", "of" and "to". They are missing because these words were removed when the GoogleNews embedding was trained; we will deal with them later.
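
A quick check (illustrative, not in the original notebook) confirms that these function words are indeed absent from the GoogleNews vocabulary:

for w in ['a', 'and', 'of', 'to']:
    print(w, w in embeddings_index)  # all four print False, matching the OOV list above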

The 5th and 7th entries are "/><br" and "/>The", which are fragments of HTML tags in these documents.

'/>' in embeddings_index
True

Since '/>' is in the embedding, we can split it off from the surrounding words, while special symbols such as '.' and '"' can simply be removed.

# Clean the text
def clean_text(x):
    x = str(x)
    x = x.replace('/>', ' /> ')  # split '/>' off from the surrounding words
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x
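# A quick illustrative check (not in the original notebook): the markup
# characters are stripped along with the punctuation, which also glues
# 'movie' and 'br' together into a single token.
print(clean_text('Great movie.<br /><br />"I loved it!"').split())
# -> something like ['Great', 'moviebr', 'br', 'I', 'loved', 'it']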
train["review"] = train["review"].progress_apply(lambda x: clean_text(x))
sentences = train["review"].apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
100%|██████████| 25000/25000 [00:01<00:00, 24338.81it/s]
100%|██████████| 25000/25000 [00:01<00:00, 15556.86it/s]
100%|██████████| 118159/118159 [00:00<00:00, 676232.77it/s]


Found embeddings for 67.73% of vocab
Found embeddings for  88.41% of all text

Vocabulary coverage jumps from 23.71% to 67.73%, and text coverage also improves considerably. Let's look at the top 10 OOV entries again:

oov[:10]
[('a', 156185),
 ('and', 155962),
 ('of', 144029),
 ('to', 133847),
 ('10', 4091),
 ('itbr', 985),
 ('moviebr', 838),
 ('filmbr', 790),
 ('20', 684),
 ('80', 591)]

We can see that numbers now account for a large share of the OOV entries. This is because digits were replaced with '#' when the embedding was trained: for example, '10' becomes '##' and '123' becomes '###'. So we apply the same replacement to the numbers in our training samples.

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x
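# Illustrative check (not part of the original notebook): digits are replaced
# by the '#' patterns the GoogleNews embedding was trained with.
print(clean_numbers('made in 1984 on a budget of 10 million'))
# -> made in #### on a budget of ## million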
train["review"] = train["review"].progress_apply(lambda x: clean_numbers(x))
sentences = train["review"].progress_apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
100%|██████████| 25000/25000 [00:03<00:00, 7477.81it/s]
100%|██████████| 25000/25000 [00:00<00:00, 38110.88it/s]
100%|██████████| 25000/25000 [00:01<00:00, 14595.70it/s]
100%|██████████| 117038/117038 [00:00<00:00, 662510.28it/s]

Found embeddings for 68.54% of vocab
Found embeddings for  88.81% of all text

Both vocabulary coverage and text coverage improve slightly. Looking at the top 10 OOV entries again:

oov[:10]
[('a', 156185),
 ('and', 155962),
 ('of', 144029),
 ('to', 133847),
 ('itbr', 985),
 ('moviebr', 838),
 ('filmbr', 790),
 ('humour', 424),
 ('timebr', 342),
 ('favourite', 315)]

In itbr, moviebr, filmbr and timebr the trailing 'br' is clearly spurious and turns valid words into misspellings. It comes from the HTML <br> tag, so it should be removed.

train["review"] = train["review"].progress_apply(lambda x:x.replace('br',' '))
sentences = train["review"].progress_apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
100%|██████████| 25000/25000 [00:00<00:00, 221442.81it/s]
100%|██████████| 25000/25000 [00:00<00:00, 37212.09it/s]
100%|██████████| 25000/25000 [00:01<00:00, 15232.10it/s]
100%|██████████| 106979/106979 [00:00<00:00, 675418.99it/s]


Found embeddings for 74.94% of vocab
Found embeddings for  89.34% of all text
oov[:10]
[('a', 156571),
 ('and', 156104),
 ('of', 144079),
 ('to', 133914),
 ('illiant', 1094),
 ('humour', 433),
 ('ief', 378),
 ('favourite', 316),
 ('eaking', 311),
 ('utal', 293)]
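
A side note before moving on: entries such as 'illiant', 'ief', 'eaking' and 'utal' are not ordinary misspellings; they are 'brilliant', 'brief', 'breaking' and 'brutal' with the substring 'br' stripped out by the bare replace('br', ' ') above. A safer alternative (illustrative only, not part of the original pipeline; strip_br_tags is a hypothetical helper) would be to remove the HTML tag itself, before the punctuation-cleaning step:

# Hypothetical alternative: strip the <br> tag before punctuation is removed,
# so 'br' never gets glued onto neighbouring words and words like 'brilliant'
# stay intact.
def strip_br_tags(x):
    return re.sub(r'<\s*br\s*/?\s*>', ' ', x)

print(strip_br_tags('Great movie.<br /><br />Loved it'))
# -> Great movie.  Loved it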

The remaining entries, such as 'humour' and 'favourite', are British spellings and other common variants, so we collect a few frequent misspellings and correct them.

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispell_dict = {'colour':'color',
                'humour':'humor',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'
                }
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)
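# Illustrative check (not in the original notebook): spellings listed in the
# dictionary are rewritten to forms the embedding knows.
print(replace_typical_misspell('my favourite colour'))
# -> my favorite color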
train["review"] = train["review"].progress_apply(lambda x: replace_typical_misspell(x))
sentences = train["review"].progress_apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
100%|██████████| 25000/25000 [00:01<00:00, 16849.08it/s]
100%|██████████| 25000/25000 [00:00<00:00, 38676.50it/s]
100%|██████████| 25000/25000 [00:01<00:00, 15228.15it/s]
100%|██████████| 106947/106947 [00:00<00:00, 672713.77it/s]


Found embeddings for 74.96% of vocab
Found embeddings for  89.36% of all text

Finally, drop 'a', 'to', 'and' and 'of'.

def drop_word(x):
    # These four high-frequency words have no vector in the GoogleNews embedding
    to_remove = [' a ', ' to ', ' of ', ' and ']
    for w in to_remove:
        x = x.replace(w, ' ')
    return x
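# Illustrative check (not in the original notebook): the four words are dropped
# wherever they are surrounded by single spaces.
print(drop_word('one of the best films and a must see'))
# -> one the best films must see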
train["review"] = train["review"].progress_apply(lambda x: drop_word(x))
sentences = train["review"].apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
100%|██████████| 25000/25000 [00:00<00:00, 41556.35it/s]
100%|██████████| 25000/25000 [00:01<00:00, 16672.09it/s]
100%|██████████| 106947/106947 [00:00<00:00, 624304.26it/s]


Found embeddings for 74.96% of vocab
Found embeddings for  99.22% of all text

Text coverage is now close to 100%. (Vocabulary coverage is unchanged because these four words still occur occasionally, for instance at the very start of a review where there is no leading space for the replacement to match, but they now account for only a tiny fraction of all tokens.) We can now train the model.

3. Training on the preprocessed text

# Build the tokenizer
tokenizer = Tokenizer(num_words=max_features,lower=True)
tokenizer.fit_on_texts(list(train['review']))
#word_index = tokenizer.word_index
x_train = tokenizer.texts_to_sequences(list(train['review']))
x_train = pad_sequences(x_train,maxlen=maxlen) # padding
y_train = list(train['sentiment'])
# Split into training and validation sets
x_train,x_val,y_train,y_val = train_test_split(x_train,y_train,test_size=0.3,random_state=0)
embedding_matrix = build_matrix(embeddings_index, tokenizer.word_index)
model = build_model(embedding_matrix)
history = model.fit(x_train, y_train, batch_size=512, epochs=10, validation_data=(x_val, y_val))
100%|██████████| 87780/87780 [00:00<00:00, 1104820.79it/s]


Train on 17500 samples, validate on 7500 samples
Epoch 1/10
17500/17500 [==============================] - 3s 192us/step - loss: 0.5736 - acc: 0.7273 - val_loss: 0.3915 - val_acc: 0.8536
Epoch 2/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.3264 - acc: 0.8633 - val_loss: 0.2806 - val_acc: 0.8833
Epoch 3/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.2271 - acc: 0.9112 - val_loss: 0.2614 - val_acc: 0.8899
Epoch 4/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.1750 - acc: 0.9376 - val_loss: 0.2695 - val_acc: 0.8935
Epoch 5/10
17500/17500 [==============================] - 2s 130us/step - loss: 0.1217 - acc: 0.9585 - val_loss: 0.2961 - val_acc: 0.8868
Epoch 6/10
17500/17500 [==============================] - 2s 129us/step - loss: 0.0801 - acc: 0.9757 - val_loss: 0.3426 - val_acc: 0.8837
Epoch 7/10
17500/17500 [==============================] - 2s 129us/step - loss: 0.0550 - acc: 0.9860 - val_loss: 0.3498 - val_acc: 0.8820
Epoch 8/10
17500/17500 [==============================] - 2s 130us/step - loss: 0.0425 - acc: 0.9903 - val_loss: 0.4001 - val_acc: 0.8807
Epoch 9/10
17500/17500 [==============================] - 2s 129us/step - loss: 0.0280 - acc: 0.9946 - val_loss: 0.4493 - val_acc: 0.8824
Epoch 10/10
17500/17500 [==============================] - 2s 132us/step - loss: 0.0178 - acc: 0.9974 - val_loss: 0.4903 - val_acc: 0.8797