NLP Preprocessing - Turning Sequences into Embeddings - Method 1 - Using Keras Utilities

I will use several examples to show the data processing methods.

Apart from generating embeddings, the epoch-level training loop and the batch-data generator shown here are also useful to study.
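As a quick preview, that epoch/batch pattern looks roughly like the sketch below (assuming a generator such as batch_iter from cnews_loader.py and a TF 1.x session; the full version appears in Case 1):

for epoch in range(num_epochs):
    # shuffle the data and cut it into mini-batches once per epoch
    for x_batch, y_batch in batch_iter(x_train, y_train, batch_size):
        # one gradient step on the current mini-batch
        session.run(train_steps, feed_dict={input_x: x_batch,
                                            input_y: y_batch,
                                            keep_prob: dropout_keep_prob})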

 

Case 1 - Text classification with LSTM

import tensorflow as tf
import os
from text_classify_rnn_cnn.data.cnews_loader import read_vocab, read_category, batch_iter, process_file, process_predict_file
import numpy as np
from sklearn import metrics


seq_length = 600
num_classes = 10
vocab_size = 5000
embedding_dim = 600

num_layers = 2
hidden_dim = 128
dropout_keep_prob = 0.8
learning_rate = 0.00005

batch_size = 128
num_epochs = 10

base_dir = 'data/cnews'
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')  # vocab.txt already generated

input_x = tf.placeholder(tf.int32, [None, seq_length])
input_y = tf.placeholder(tf.float32, [None, num_classes])
keep_prob = tf.placeholder(tf.float32)


def text_rnn_mlp():
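    # Model: embedding lookup -> multi-layer LSTM -> output at the last time step
    # -> fully connected layer with dropout and ReLU -> softmax classifier.
    # Returns the loss, the training op, the accuracy node and the predicted labels.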

    embedding = tf.get_variable('embedding', [vocab_size, embedding_dim])
    embedding_input = tf.nn.embedding_lookup(embedding, input_x)

    def lstm_cell():
        cell = tf.contrib.rnn.BasicLSTMCell(hidden_dim, state_is_tuple=True)
        dropout = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
        return dropout

    cells = [lstm_cell() for _ in range(num_layers)]  # just a list of LSTM cells
    rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
    _output, state = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_input, dtype=tf.float32)
    last_output = _output[:, -1, :]

    # mlp
    fc1 = tf.layers.dense(inputs=last_output, units=hidden_dim, name='fc1')
    fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
    fc1_relu = tf.nn.relu(fc1_dropout)

    logits = tf.layers.dense(inputs=fc1_relu, units=num_classes, name='fc2')
    y_pred = tf.argmax(tf.nn.softmax(logits), 1)

    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=input_y)
    loss = tf.reduce_mean(cross_entropy)
    train_steps = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

    correct_pred = tf.equal(y_pred, tf.argmax(input_y, 1))
    acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    return loss, train_steps, acc, y_pred


def train_model(state_flag):
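    # Build the graph and, depending on state_flag, either train the model ('train'),
    # evaluate it on the test set ('test'), or predict categories for new text ('predict').
    # The 'test' and 'predict' branches restore the latest checkpoint from save_dir.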
    total_train_acc = 0.0
    loss, train_steps, acc, y_pred = text_rnn_mlp()
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    session = tf.Session()
    session.run(tf.global_variables_initializer())

    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, seq_length)
    print('len(x_train)', len(x_train), 'len(x_val)', len(x_val))
    print(x_train[0:2])

    if state_flag == 'train':
        for i in range(num_epochs):
            total_train_acc = 0.0  # reset the running accuracy sum at the start of each epoch
            batch_train = batch_iter(x_train, y_train, batch_size)

            for x_batch, y_batch in batch_train:
                batch_num = len(x_batch)
                session.run(train_steps, feed_dict={input_x: x_batch, input_y: y_batch, keep_prob: dropout_keep_prob})
                loss2, accuracy2 = session.run([loss, acc], feed_dict={input_x: x_batch, input_y: y_batch, keep_prob: dropout_keep_prob})
                print("accuracy : %f ,  loss : %g" % (accuracy2, loss2))
                total_train_acc = total_train_acc + accuracy2*batch_num

            train_acc = total_train_acc / len(x_train)
            print('epoch:%d,  train_acc:%f' % (i, train_acc))
            val_loss, val_acc = session.run([loss, acc], feed_dict={input_x: x_val[0:256], input_y: y_val[0:256], keep_prob: 1.0})  # disable dropout when evaluating
            print('epoch:%d,  val_acc:%f' % (i, val_acc))
            if val_acc > train_acc:
                saver.save(session, save_dir+'my_rnn_model.ckpt')

    if state_flag == 'test':
        ckpt = tf.train.get_checkpoint_state(save_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(session, ckpt.model_checkpoint_path)

            x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, seq_length)
            print('len(x_test)', len(x_test))
            y_test_labels = np.argmax(y_test, 1)
            batch_size2 = 128
            data_len = len(x_test)
            num_batch = int((data_len - 1) / batch_size2) + 1

            y_pred_labels = np.zeros(shape=len(x_test), dtype=np.int32)  # store the predicted labels
            for i in range(num_batch):  # process the test set batch by batch
                start_id = i * batch_size2
                end_id = min((i + 1) * batch_size2, data_len)
                y_pred_labels[start_id: end_id] = session.run(y_pred, feed_dict={input_x: x_test[start_id: end_id], keep_prob: 1.0})  # no dropout at test time

            print("Precision, Recall and F1-Score...")
            print(metrics.classification_report(y_test_labels, y_pred_labels))

            # confusion matrix
            print("Confusion Matrix...")
            cm = metrics.confusion_matrix(y_test_labels, y_pred_labels)
            print(cm)

    if state_flag == 'predict':
        ckpt = tf.train.get_checkpoint_state(save_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(session, ckpt.model_checkpoint_path)

            x_pred = process_predict_file(pred_dir, word_to_id, seq_length)
            batch_size3 = 128
            data_len = len(x_pred)
            num_batch = int((data_len - 1) / batch_size3) + 1

            y_pred_labels = np.zeros(shape=len(x_pred), dtype=np.int32)  # store the predicted labels
            for i in range(num_batch):  # process the input batch by batch
                start_id = i * batch_size3
                end_id = min((i + 1) * batch_size3, data_len)
                y_pred_labels[start_id: end_id] = session.run(y_pred, feed_dict={input_x: x_pred[start_id: end_id], keep_prob: 1.0})  # no dropout at prediction time

            print('y_pred_labels', y_pred_labels)
            categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
            for id in range(len(y_pred_labels)):
                print('The category of news item %d is: %s' % (id+1, categories[y_pred_labels[id]]))


save_dir = 'text_classify_rnn_cnn/my_ckpt/'
pred_dir = 'text_classify_rnn_cnn/pred_file_input.txt'
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)  # vocab.txt already generated
train_model('train')

The process of generating the embedding input is as follows:

cnews_loader.py

# coding: utf-8

import sys
from collections import Counter

import numpy as np
import tensorflow.contrib.keras as kr

if sys.version_info[0] > 2:
    is_py3 = True
else:
    reload(sys)  # needed in Python 2 so that setdefaultencoding is available
    sys.setdefaultencoding("utf-8")
    is_py3 = False


def native_word(word, encoding='utf-8'):
    """如果在python2下面使用python3训练的模型,可考虑调用此函数转化一下字符编码"""
    if not is_py3:
        return word.encode(encoding)
    else:
        return word


def native_content(content):
    if not is_py3:
        return content.decode('utf-8')
    else:
        return content


def open_file(filename, mode='r'):
    # mode: 'r' or 'w' for read or write
    if is_py3:
        return open(filename, mode, encoding='utf-8', errors='ignore')
    else:
        return open(filename, mode)


def read_file(filename):
    """读取文件数据"""
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if content:
                    contents.append(list(native_content(content)))
                    labels.append(native_content(label))
            except:
                pass
    return contents, labels


# content, label = read_file('D:/DL/lstm_blog_case/text_classify_rnn_cnn/data/cnews/cnews.train.txt')
# print("content",content[0:2])  #    content [['马', '晓', '旭', '意', '外', '受', '伤'....]]
# print("label",label[0:10])    #   ['体育', '体育'.....]


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build the global vocabulary from the training data and write it to vocab_dir."""
    data_train, _ = read_file(train_dir)
    data_train, _ = read_file(train_dir)

    all_data = []
    for content in data_train:
        all_data.extend(content)

    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size - 1)
    # the most frequent characters and how often each appears
    words, _ = list(zip(*count_pairs))
    # print(list(zip(*count_pairs)))
    # add a <PAD> token so that all texts can be padded to the same length
    words = ['<PAD>'] + list(words)
    open_file(vocab_dir, mode='w').write('\n'.join(words) + '\n')


# Example call (the vocabulary file has already been generated, so this only needs to run once):
# build_vocab('D://DL//lstm_blog_case//text_classify_rnn_cnn//data//cnews//cnews.train.txt',
#             'D://DL//lstm_blog_case//text_classify_rnn_cnn//write.txt', vocab_size=5000)


def read_vocab(vocab_dir):
    """读取词汇表"""
    # words = open_file(vocab_dir).read().strip().split('\n')
    with open_file(vocab_dir) as fp:
        # 如果是py2 则每个值都转化为unicode
        words = [native_content(_.strip()) for _ in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


def read_category():
    """读取分类目录,固定"""
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']

    categories = [native_content(x) for x in categories]

    cat_to_id = dict(zip(categories, range(len(categories))))

    return categories, cat_to_id


def to_words(content, words):
    """将id表示的内容转换为文字"""
    return ''.join(words[x] for x in content)


def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """convert all the words in the file into id (number)"""
    contents, labels = read_file(filename)

    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])

    # use Keras' pad_sequences to pad every text to the same fixed length
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # convert the labels to one-hot vectors

    return x_pad, y_pad


def batch_iter(x, y, batch_size=64):
    """生成批次数据"""
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1

    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]


# functions added for prediction

def read_predict_file(filename):
    """Read a prediction file: one piece of text per line, no labels."""
    contents = []
    with open_file(filename) as f:
        for line in f:
            try:
                content = line.strip()
                if content:
                    contents.append(list(native_content(content)))
            except:
                pass
    return contents


def process_predict_file(filename, word_to_id, max_length=600):
    """将文件转换为id表示"""
    contents = read_predcit_file(filename)
    data_id = []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
    # use Keras' pad_sequences to pad every text to the same fixed length
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)

    return x_pad

1. Generate the global vocabulary dictionary, which maps each unique word to a unique index.

In this code, it is assumed that the vocabulary word list (cnews.vocab.txt) has already been generated; a minimal sketch of the resulting mapping follows.
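For illustration, here is a minimal sketch of the mapping that read_vocab builds, using a tiny toy vocabulary instead of the real 5000-entry cnews.vocab.txt:

toy_words = ['<PAD>', '的', '在', '了', '是']  # toy vocabulary; '<PAD>' always sits at index 0
word_to_id = dict(zip(toy_words, range(len(toy_words))))
print(word_to_id)  # {'<PAD>': 0, '的': 1, '在': 2, '了': 3, '是': 4}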

2. Convert each word into its index in the vocabulary list, so that every text note is represented by a sequence of indices.

For instance, each note is represented like this: [1, 34, 32, 1006, 200, ...]

By the way, the data is in Chinese, so each token is a single Chinese character; a self-contained sketch of this step follows.
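The sketch below walks through this conversion on a toy vocabulary; the filter in the list comprehension mirrors process_file, which simply drops out-of-vocabulary characters:

toy_vocab = ['<PAD>', '马', '晓', '旭', '意', '外', '受', '伤']
word_to_id = dict(zip(toy_vocab, range(len(toy_vocab))))
note = '马晓旭意外受伤'
note_ids = [word_to_id[ch] for ch in note if ch in word_to_id]  # out-of-vocabulary characters are dropped
print(note_ids)  # [1, 2, 3, 4, 5, 6, 7]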

3. Pad the input text (input data) to a fixed length and convert the labels to one-hot vectors.

Here, the preprocessing utilities provided by Keras are used:

import tensorflow.contrib.keras as kr


def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """convert all the words in the file into id (number)"""
    contents, labels = read_file(filename)

    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])

    # use Keras' pad_sequences to pad every text to the same fixed length
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # convert the labels to one-hot vectors

    return x_pad, y_pad

If max_length is set to 800, zeros are padded at the front of each sequence (pad_sequences pads at the front by default), as in the output below; a minimal toy example follows the output.

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0  387 1197 2173  215  110  264  814  219   16  725  981 2629
   245 1645   14 1190  231  110  808 2026 2784   43  581  224   98 2345
   470 1190 1609  659  188  209   34   32 1609  659    1   16  725  244
   716  153  165    8 1309 1362 1190  298    2 1061 1478    3  105   70
    49   12   62   61  951   91  164    1   16  725  244    2   62  234
   977  851  333  144  264   32   14 1190    2  900 1478    1  245 1645
   100   61  244  200   40  183 1181 1243   10   18   55   52  883   56
  1191 1191  246   57    3   49   12   62   20  951   12    7  164    1
    16  725  244    6  725  134   11  169  110   57  977  851    2   27
   475    1  125   56    5 1675 1327 1327    2    1  405  554  468  188
   336  185  143  125   61  951 1609  659   56    8   14 1190    1  108
  1135  121  244 1564   20  951    2  977  851  194  165    8  264   32
   330  464  900 1478    3   61  951   91  164    1  143  157  244 1296
   271  977  851   57   27    1   14 1190  167   63   61   10  385   22
   122   27    1   80  505 1055 1342  165    8  886   61   34    2  215
   730    3 1551  205  538    4  538    2  608  144    1  157  244   72
   404   10  143  125   61  951    2  644   36  977  851    1   18   55
    52  883   66  202   10    1  125  405  165    8  330  464  490  121
     2 1278  554    1   21   10  232  797  157  200   40    1   16  725
   244  526  126   11  853  143  125    2  977  851    1  117  244  371
   534 1404  267 1070  832    3    6 1190   11  977  851   39  589  157
   244   34   84  194    9    5  421  217 1712 1993  182    1  108    6
   725  492   35  534   86   72  404  100   65    1  117  244  326   68
    23 1950 1052   24   10    3    6 1609  659   71   59    4  309    2
   977  851    1   16  725  244  321  332   41  232  297   54    8    2
   157  200    9  333   33   54  215  110    2  814  931  162  477   31
   831  120  593  247  253   81  212    1  166  158   19    4 1015  576
   718  239  977  851  264  814   15  718  239  242 1569  151 1763  931
     2   33   54  230  244 1564  358    6   10  161  143  155   41    2
   172  555    3   80 1296  271 1609  659  100   59    1   11   59  699
  1361 2409 1584   56    4  339  165    8  977  851    1 1361 2409 1584
     5  105   70   18  105   62    6  132  744 1533   20   10  242 1569
     1  166  158   46  165    8  493  115   18   77   62  654 1269  257
   703  470    2  422  252  212    3  244 1564  639  845   84    1 1361
  2409 1584  194  165    8   33   54   37 1808 1758  721    1  108   21
    10  324  117  220  503    1   19  173  125   96    5  219   44 1084
  1012  961  365    1  151  242 1569 1464  611  121   10  100   59  333
  1434  562  977  851    3  269   60    8   10 1361 2409 1584   19   22
   644  139    1  166  158   16  725  244   39 1190   11  977  851   56
   336   68  170  316 1619 1524    1  109   41    5  747  169  157  200
    40  264 1931   80  545   37  242 1569    1  345   50  282  283 1040
   769  200    3   80  291  589  244  200  387 1197 2173    6  422  252
   212   11  264  814  293  610  245 1462  725  492    2   65  228    1
    46  219    6 1609  659    2   16  725  244   54    6  231  110  981
  2629    1   23  977  851   11    9  631 3222 4015  244  200   40   41
   483  215  111   69    1   28   40   51    9   45  333   33   19  184
     2  182  162   10    3   24    4  172  152   69   13  200  127  185
     3  175  132  744   32 1609  659    1 1190  298    4  533 1333  546
   205   16  725  244    1   23   46 2132   10    1   28   40  531   32
   986  662 1190   56   61   32  986  662    1    6  132  744  385  130
   977  851   88  230   14 1190  305 3011   30   10    1  165  265   32
    34 1609  659  433 1635   32   19  187  182  162    3   24    4  172
    16  725  157  200   46   39 1190  298    2   23  808 2026   24    8
   156    9  311    3]]
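To make the pre-padding and one-hot encoding concrete, here is a minimal toy example (assuming the same tensorflow.contrib.keras import used in cnews_loader.py; under TF 2.x the equivalent functions live in tensorflow.keras):

import tensorflow.contrib.keras as kr

data_id = [[1, 2, 3], [4, 5]]  # two short notes as id sequences
label_id = [0, 2]              # their category indices
x_pad = kr.preprocessing.sequence.pad_sequences(data_id, maxlen=8)  # zeros are padded at the front by default
y_pad = kr.utils.to_categorical(label_id, num_classes=3)            # labels become one-hot vectors
print(x_pad)
# [[0 0 0 0 0 1 2 3]
#  [0 0 0 0 0 0 4 5]]
print(y_pad)
# [[1. 0. 0.]
#  [0. 0. 1.]]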

 
