one-hot编码是将标记转换为向量最常用、最基本的方法。它将每个单词与一个唯一的整数索引相关联,然后将这个整数索引 $i$ 转换为长度为 $N$ 的二进制向量($N$ 是词表大小),这个向量只有第 $i$ 个元素是1,其余元素都为0。当然,也可以进行字符级的one-hot编码。
# Word-level one-hot encoding.
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Build an index of every token in the data: each unique word gets a unique
# integer. Indices start at 1, so index 0 is never assigned to a word.
token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1

# Tokenize the samples; only the first max_length words of each are encoded.
max_length = 10

# results[i, j, k] is 1.0 when the j-th word of sample i has index k.
results = np.zeros(
    shape=(len(samples), max_length, max(token_index.values()) + 1)
)
for i, sample in enumerate(samples):
    # Slice the token list before enumerating so no throwaway list of
    # (position, word) pairs is built.
    for j, word in enumerate(sample.split()[:max_length]):
        # Direct lookup: every word was indexed above, and a missing key
        # should fail loudly instead of yielding None (which NumPy indexing
        # would interpret as np.newaxis and set the whole row).
        index = token_index[word]
        results[i, j, index] = 1.

print(results)
# Character-level one-hot encoding.
import string

import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

characters = string.printable  # all printable ASCII characters
# Map each character to a unique index starting at 1; index 0 stays unused.
token_index = dict(zip(characters, range(1, len(characters) + 1)))

# Only the first max_length characters of each sample are encoded.
max_length = 50

# results[i, j, k] is 1.0 when the j-th character of sample i has index k.
results = np.zeros(
    (len(samples), max_length, max(token_index.values()) + 1)
)
for i, sample in enumerate(samples):
    # Truncate to max_length so a long sample cannot index past axis 1.
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        if index is None:
            # Character outside string.printable: leave its row all zeros.
            # Indexing with None would act as np.newaxis and set the whole
            # row to 1.
            continue
        results[i, j, index] = 1.