直接看代码即可,代码注释还是比较详细的。
1、单词级的 one-hot 编码
import numpy as np
# Word-level one-hot encoding: build a vocabulary from the samples, then
# encode every sample as a (max_length, vocabulary size + 1) matrix of 0/1.
samples = [
    'the cat sat on the mat.',
    'the dog ate my homework',
    'the chicken is delicious',
]

# Maps every distinct word to its integer index.
token_index = {}
for sample in samples:
    # str.split() is a naive tokenizer: punctuation stays glued to the word
    # ('mat.'). A real application would also strip punctuation and special
    # characters from the samples.
    for word in sample.split():
        if word not in token_index:
            # Indices start at 1; index 0 is deliberately not assigned to
            # any word.
            token_index[word] = len(token_index) + 1

# Only the first max_length words of each sample are encoded. A "sample" is
# one element of `samples`, e.g. 'the cat sat on the mat.'.
max_length = 10

# One (max_length, max(token_index.values()) + 1) matrix per sample.
results = np.zeros(
    shape=(len(samples), max_length, max(token_index.values()) + 1)
)
for i, sample in enumerate(samples):
    # Slice the word list before enumerating so no throwaway list of
    # (position, word) pairs is built.
    for j, word in enumerate(sample.split()[:max_length]):
        # Direct lookup instead of .get(): a missing word would give None,
        # and `results[i, j, None] = 1` would fill the whole row with ones
        # instead of failing.
        index = token_index[word]
        results[i, j, index] = 1
# results[i] is the one-hot matrix of the i-th sample: row j holds a single 1
# in the column given by the index of the sample's j-th word; rows past the
# end of the sample stay all zero.

# The prints below illustrate the intermediate values used above.
print("token_index:\n", token_index)
print("token_index.values():\n", token_index.values())
print("list(enumerate('the cat sat on the mat.'.split())):\n", list(enumerate('the cat sat on the mat.'.split())))
print("results[0]:\n", results[0])  # one-hot matrix of 'the cat sat on the mat.'
输出为:
token_index:
{'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat.': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9, 'chicken': 10, 'is': 11, 'delicious': 12}
token_index.values():
dict_values([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
list(enumerate('the cat sat on the mat.'.split())):
[(0, 'the'), (1, 'cat'), (2, 'sat'), (3, 'on'), (4, 'the'), (5, 'mat.')]
results[0]:
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
2、字符级的onehot编码
这里的字符不仅指单词中的字母,还包括数字、标点、空格等所有可打印的 ASCII 字符(即 string.printable 中的字符)。
import string
import numpy as np
# Character-level one-hot encoding: every printable ASCII character gets an
# index, and each sample becomes a (max_length, number of characters + 1)
# matrix of 0/1.
samples = [
    'the cat sat on the mat.',
    'the dog ate my homework',
    'the chicken is delicious',
]

characters = string.printable  # all printable ASCII characters

# Lookup table character -> index. Indices start at 1, so column 0 is never
# assigned to a character.
token_index = dict(zip(characters, range(1, len(characters) + 1)))

# Only the first max_length characters of each sample are encoded.
max_length = 50

results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    # Truncate to max_length: without the slice, a sample longer than
    # max_length characters raises IndexError on the assignment below.
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)  # integer index of the character
        if index is None:
            # Not a printable ASCII character. Indexing with None would
            # broadcast the 1 over the whole row, so leave the row all zero.
            continue
        results[i, j, index] = 1
# results[i] is the one-hot matrix of the i-th sample; its shape is
# (max_length, max(token_index.values()) + 1).

# The prints below illustrate the intermediate values used above.
print("token_index:", token_index)
for x, y in enumerate('the cat sat on the mat.'):
    print(x, y)
print("list(enumerate('the cat sat on the mat.')):\n", list(enumerate('the cat sat on the mat.')))
print("results[0]:\n", results[0])  # one-hot matrix of 'the cat sat on the mat.'
输出为:
token_index: {'0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'a': 11, 'b': 12, 'c': 13, 'd': 14, 'e': 15, 'f': 16, 'g': 17, 'h': 18, 'i': 19, 'j': 20, 'k': 21, 'l': 22, 'm': 23, 'n': 24, 'o': 25, 'p': 26, 'q': 27, 'r': 28, 's': 29, 't': 30, 'u': 31, 'v': 32, 'w': 33, 'x': 34, 'y': 35, 'z': 36, 'A': 37, 'B': 38, 'C': 39, 'D': 40, 'E': 41, 'F': 42, 'G': 43, 'H': 44, 'I': 45, 'J': 46, 'K': 47, 'L': 48, 'M': 49, 'N': 50, 'O': 51, 'P': 52, 'Q': 53, 'R': 54, 'S': 55, 'T': 56, 'U': 57, 'V': 58, 'W': 59, 'X': 60, 'Y': 61, 'Z': 62, '!': 63, '"': 64, '#': 65, '$': 66, '%': 67, '&': 68, "'": 69, '(': 70, ')': 71, '*': 72, '+': 73, ',': 74, '-': 75, '.': 76, '/': 77, ':': 78, ';': 79, '<': 80, '=': 81, '>': 82, '?': 83, '@': 84, '[': 85, '\\': 86, ']': 87, '^': 88, '_': 89, '`': 90, '{': 91, '|': 92, '}': 93, '~': 94, ' ': 95, '\t': 96, '\n': 97, '\r': 98, '\x0b': 99, '\x0c': 100}
0 t
1 h
2 e
3
4 c
5 a
6 t
7
8 s
9 a
10 t
11
12 o
13 n
14
15 t
16 h
17 e
18
19 m
20 a
21 t
22 .
list(enumerate('the cat sat on the mat.')):
[(0, 't'), (1, 'h'), (2, 'e'), (3, ' '), (4, 'c'), (5, 'a'), (6, 't'), (7, ' '), (8, 's'), (9, 'a'), (10, 't'), (11, ' '), (12, 'o'), (13, 'n'), (14, ' '), (15, 't'), (16, 'h'), (17, 'e'), (18, ' '), (19, 'm'), (20, 'a'), (21, 't'), (22, '.')]
results[0]:
[[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
...
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]
3、用 Keras 实现单词级的 one-hot 编码
Keras 的内置函数可以对原始文本数据进行单词级或字符级的 one-hot 编码。你应该使用这些函数,因为它们实现了许多重要的特性,比如从字符串中去除特殊字符、只考虑数据集中前 N个最常见的单词(这是一种常用的限制,以避免处理非常大的输入向量空间)。
from keras.preprocessing.text import Tokenizer
# Word-level one-hot encoding with the Keras Tokenizer.
samples = [
    'the cat sat on the mat.',
    'the dog ate my homework',
    'the chicken is delicious',
]

# Create a tokenizer configured to consider only the 1000 most common words.
tokenizer = Tokenizer(num_words=1000)

# Build the word index from the samples.
tokenizer.fit_on_texts(samples)

# Recover the word -> index mapping computed by the tokenizer.
word_index = tokenizer.word_index

# Turn each string into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

# Get the binary one-hot representation directly. The tokenizer supports
# vectorization modes other than one-hot as well.
one_hot_results = tokenizer.texts_to_matrix(
    samples,
    mode='binary',
)

# The prints below illustrate the values computed above.
print("sequences:\n", sequences)
print("one_hot_results:\n", one_hot_results)
print("one_hot_results[0]:\n", one_hot_results[0])  # encoding of the first sample
print("len(one_hot_results[0]:\n", len(one_hot_results[0]))
print("word_index:\n", word_index)
print("found %s unique tokens." % len(word_index))
输出为:
sequences:
[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9], [1, 10, 11, 12]]
one_hot_results:
[[0. 1. 1. ... 0. 0. 0.]
[0. 1. 0. ... 0. 0. 0.]
[0. 1. 0. ... 0. 0. 0.]]
one_hot_results[0]:
[0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
len(one_hot_results[0]:
1000
word_index:
{'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9, 'chicken': 10, 'is': 11, 'delicious': 12}
found 12 unique tokens.
4、使用散列技巧的单词级的onehot编码
one-hot 编码的一种变体是所谓的 one-hot 散列技巧(one-hot hashing trick),如果词表中唯一标记的数量太大而无法直接处理,就可以使用这种技巧。这种方法没有为每个单词显式分配一个索引并将这些索引保存在一个字典中,而是将单词散列编码为固定长度的向量,通常用一个非常简单的散列函数来实现。这种方法的主要优点在于,它避免了维护一个显式的单词索引,从而节省内存并允许数据的在线编码(在读取完所有数据之前,你就可以立刻生成标记向量)。
这种方法有一个缺点,就是可能会出现散列冲突(hash collision),即两个不同的单词可能具有相同的散列值,随后任何机器学习模型观察这些散列值,都无法区分它们所对应的单词。如果散列空间的维度远大于需要散列的唯一标记的个数,散列冲突的可能性会减小。
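下面例子中,通过 index = abs(hash(word)) % dimensionality 来给单词生成索引,而不是通过自己去构建词典来得到索引,单词向量的长度为 1000。但 index = abs(hash(word)) % dimensionality 有可能导致两个不同单词的索引 index 相同。另外需要注意:Python 3 中字符串的 hash() 默认带有随机盐(由环境变量 PYTHONHASHSEED 控制),同一个单词在不同进程中得到的散列值并不相同,所以下面输出中的具体数值每次运行都会变化;如果需要跨进程可复现的索引,应改用 hashlib 等稳定的散列函数。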
import numpy as np
# Word-level one-hot encoding with the hashing trick: a word's column is
# derived from its hash instead of from an explicit vocabulary.
samples = [
    'the cat sat on the mat.',
    'the dog ate my homework',
    'the chicken is delicious',
]

dimensionality = 1000  # size of the hash space, i.e. length of each word vector
max_length = 10        # only the first max_length words of a sample are encoded

results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    # Slice the word list before enumerating so no throwaway list of
    # (position, word) pairs is built.
    for j, word in enumerate(sample.split()[:max_length]):
        # Two different words can land on the same index (hash collision).
        # NOTE: the built-in hash() of a str is salted per interpreter
        # process unless PYTHONHASHSEED is fixed, so the values printed here
        # differ from run to run.
        index = abs(hash(word)) % dimensionality
        print(word, "'s hash value: ", hash(word))
        print(word, "'s index: ", index)
        results[i, j, index] = 1
# results[i] is the (max_length, dimensionality) one-hot matrix of the i-th
# sample; rows past the end of the sample stay all zero.

print("results[0]:", results[0])  # one-hot matrix of the first sample
输出为:
the 's hash value: -1087449444902206732
the 's index: 732
cat 's hash value: 3320912556947376308
cat 's index: 308
sat 's hash value: -7706466985899488210
sat 's index: 210
on 's hash value: 1252657084041796533
on 's index: 533
the 's hash value: -1087449444902206732
the 's index: 732
mat. 's hash value: 1381930449557149416
mat. 's index: 416
the 's hash value: -1087449444902206732
the 's index: 732
dog 's hash value: -3073929831791817846
dog 's index: 846
ate 's hash value: 5090131066225908060
ate 's index: 60
my 's hash value: 497248659279338412
my 's index: 412
homework 's hash value: -6911085521957117321
homework 's index: 321
the 's hash value: -1087449444902206732
the 's index: 732
chicken 's hash value: 8084250583515557290
chicken 's index: 290
is 's hash value: -118710236214562403
is 's index: 403
delicious 's hash value: -8226280207471691178
delicious 's index: 178
results[0]: [[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
...
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]
上面文字和代码大部分参考了《Python深度学习》,自己做了一些删减和说明,给代码加了一些注释,希望能更易懂一些,希望对各位有所帮助。