《python深度学习》第六章深度学习用于文本和序列
6.1 处理文本数据
自然语言处理的应用:文档分类、情感分析、作者识别,甚至问答系统
处理文本数据,就是将其转换成数值张量,即文本向量化
# one_hot编码
# 独热编码,是处理文本最常用的编码方式
# one-hot编码简单示例
# 单词级别的one-hot编码
# Toy word-level one-hot encoding, built by hand with NumPy.
import numpy as np

samples = ['the cat sat on the mat.', 'the dog ate my homework.']

# Map every distinct whitespace-separated token to a unique integer,
# in order of first appearance.  Index 0 is deliberately left unused.
token_index = {}
for sample in samples:
    for word in sample.split():
        token_index.setdefault(word, len(token_index) + 1)

# Encode at most the first 10 words of each sample.
max_length = 10

# results[i, j, k] == 1 means: word j of sample i carries index k.
results = np.zeros(shape=(len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        results[i, j, token_index[word]] = 1

print(results)
独热编码
[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]
# Character-level one-hot encoding example.
import string

samples = ['the cat sat on the mat.', 'the dog ate my homework.']
characters = string.printable  # all printable ASCII characters

# BUG FIX: the original built dict(zip(range(1, ...), characters)), i.e. a
# mapping from index -> character.  token_index.get(character) then always
# returned None, and results[i, j, None] = 1 (None acts as np.newaxis in
# NumPy indexing) silently set the ENTIRE row to 1 instead of one-hot
# encoding a single position.  The mapping must go character -> index.
token_index = {char: idx for idx, char in enumerate(characters, start=1)}

max_length = 50  # encode at most the first 50 characters of each sample

# Index 0 is left unused, hence the "+ 1" on the last axis.
results = np.zeros((len(samples), max_length, len(characters) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        results[i, j, index] = 1

print(results)
字符级编码输出(注意:下面每个已用行全为 1——one-hot 行应当只含一个 1。这是因为 token_index 被建成了“索引→字符”的映射,token_index.get(character) 恒为 None,而 results[i, j, None] = 1 会把整行置 1,暴露了原代码的查表 bug)
[[[1. 1. 1. ... 1. 1. 1.]
[1. 1. 1. ... 1. 1. 1.]
[1. 1. 1. ... 1. 1. 1.]
...
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]
[[1. 1. 1. ... 1. 1. 1.]
[1. 1. 1. ... 1. 1. 1.]
[1. 1. 1. ... 1. 1. 1.]
...
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]]
# Word-level one-hot encoding using Keras' built-in Tokenizer.
from keras.preprocessing.text import Tokenizer

# FIX: 'san' was a typo of 'sat' (the same sentence is spelled 'sat'
# everywhere else in this chapter); token indices are unchanged.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Create a tokenizer configured to keep only the 1000 most common words.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)  # build the word index from the corpus

# Turn each string into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)
print(sequences)

# Direct one-hot (binary) matrix representation, shape (len(samples), 1000).
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
print(one_hot_results)

word_index = tokenizer.word_index
# BUG FIX: the original called print('Found %s unique tokens.', len(word_index)),
# passing TWO arguments, so the literal '%s' was printed unformatted.
# Interpolate the count with the % operator instead.
print('Found %s unique tokens.' % len(word_index))
[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
[[0. 1. 1. ... 0. 0. 0.]
[0. 1. 0. ... 0. 0. 0.]]
Found %s unique tokens. 9   ← 注意:输出中出现了字面的 "%s",说明 print 收到了两个参数而没有做格式化;正确写法是 print('Found %s unique tokens.' % len(word_index))
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
# We will store our words as vectors of size 1000.
# Note that if you have close to 1000 words (or more)
# you will start seeing many hash collisions, which
# will decrease the accuracy of this encoding method.
dimensionality = 1000
max_length = 10
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
for j, word in list(enumerate(sample.split()))[