《python深度学习》第六章深度学习用于文本和序列
6.1 处理文本数据
自然语言处理的应用:文档分类、情感分析、作者识别,甚至问答系统
处理文本数据,就是将其转换成数值张量,即文本向量化
# one_hot编码
# 独热编码,是处理文本最常用的编码方式
# one-hot编码简单示例
# 单词级别的one-hot编码
# Toy word-level one-hot encoding, built by hand with NumPy.
import numpy as np

samples = ['the cat sat on the mat.', 'the dog ate my homework.']

# Map every distinct whitespace-separated token to a unique integer,
# in order of first appearance.  Index 0 is deliberately left unused.
token_index = {}
for sample in samples:
    for word in sample.split():
        token_index.setdefault(word, len(token_index) + 1)

# Encode at most the first 10 words of each sample.
max_length = 10

# results[i, j, k] == 1 means: word j of sample i carries index k.
results = np.zeros(shape=(len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        results[i, j, token_index[word]] = 1

print(results)
独热编码
[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]
# Character-level one-hot encoding example.
import string

samples = ['the cat sat on the mat.', 'the dog ate my homework.']
characters = string.printable  # all printable ASCII characters

# BUG FIX: the original built dict(zip(range(1, ...), characters)), i.e. a
# mapping from index -> character.  token_index.get(character) then always
# returned None, and results[i, j, None] = 1 (None acts as np.newaxis in
# NumPy indexing) silently set the ENTIRE row to 1 instead of one-hot
# encoding a single position.  The mapping must go character -> index.
token_index = {char: idx for idx, char in enumerate(characters, start=1)}

max_length = 50  # encode at most the first 50 characters of each sample

# Index 0 is left unused, hence the "+ 1" on the last axis.
results = np.zeros((len(samples), max_length, len(characters) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        results[i, j, index] = 1

print(results)
字符级编码输出(注意:下面每个已用行全为 1——one-hot 行应当只含一个 1。这是因为 token_index 被建成了“索引→字符”的映射,token_index.get(character) 恒为 None,而 results[i, j, None] = 1 会把整行置 1,暴露了原代码的查表 bug)
[[[1. 1. 1. ... 1. 1. 1.]
[1. 1. 1. ... 1. 1. 1.]
[1. 1. 1. ... 1. 1. 1.]
...
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]
[[1. 1. 1. ... 1. 1. 1.]
[1. 1. 1. ... 1. 1. 1.]
[1. 1. 1. ... 1. 1. 1.]
...
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]]
# Word-level one-hot encoding using Keras' built-in Tokenizer.
from keras.preprocessing.text import Tokenizer

# FIX: 'san' was a typo of 'sat' (the same sentence is spelled 'sat'
# everywhere else in this chapter); token indices are unchanged.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Create a tokenizer configured to keep only the 1000 most common words.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)  # build the word index from the corpus

# Turn each string into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)
print(sequences)

# Direct one-hot (binary) matrix representation, shape (len(samples), 1000).
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
print(one_hot_results)

word_index = tokenizer.word_index
# BUG FIX: the original called print('Found %s unique tokens.', len(word_index)),
# passing TWO arguments, so the literal '%s' was printed unformatted.
# Interpolate the count with the % operator instead.
print('Found %s unique tokens.' % len(word_index))
[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
[[0. 1. 1. ... 0. 0. 0.]
[0. 1. 0. ... 0. 0. 0.]]
Found %s unique tokens. 9   ← 注意:输出中出现了字面的 "%s",说明 print 收到了两个参数而没有做格式化;正确写法是 print('Found %s unique tokens.' % len(word_index))
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
# We will store our words as vectors of size 1000.
# Note that if you have close to 1000 words (or more)
# you will start seeing many hash collisions, which
# will decrease the accuracy of this encoding method.
dimensionality = 1000
max_length = 10
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
for j, word in list(enumerate(sample.split()))[