*Deep Learning with Python* study notes and code (Chapter 6, Section 6.1: Working with text data)

*Deep Learning with Python*, Chapter 6: deep learning for text and sequences

6.1 Working with text data

Applications of natural language processing include document classification, sentiment analysis, author identification, and even question answering.

Working with text data means converting it into numeric tensors, a step known as text vectorization.
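Before any encoding, a text is first split into tokens (words, characters, or n-grams). A minimal sketch of the two tokenizations used below (illustrative only; the variable names are ours):

text = 'the cat sat on the mat.'

word_tokens = text.split()  # word-level tokens
char_tokens = list(text)    # character-level tokens

print(word_tokens)      # ['the', 'cat', 'sat', 'on', 'the', 'mat.']
print(char_tokens[:5])  # ['t', 'h', 'e', ' ', 'c']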

# One-hot encoding
# One-hot encoding is the most common way to encode text

# A simple one-hot encoding example

# Word-level one-hot encoding
import numpy as np 
samples = ['the cat sat on the mat.','the dog ate my homework.']

token_index = {}  # build an index of all tokens in the data
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1  # assign a unique index to each unique word (index 0 is not used)
            
# Vectorize the samples; consider only the first 10 words of each sample
max_length = 10

# Store the results in `results`
results = np.zeros(shape=(len(samples), max_length, max(token_index.values()) + 1))

for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1
print(results)

One-hot encoded output:

[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]
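As a sanity check, the encoding can be inverted with the index built above; a minimal sketch (the reverse_index name is ours):

reverse_index = {index: word for word, index in token_index.items()}  # invert the word index

for row in results[0]:
    if row.any():  # skip the all-zero padding rows
        print(reverse_index[int(row.argmax())])  # prints the words of the first sample, one per line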
# Character-level one-hot encoding example

import string

samples = ['the cat sat on the mat.','the dog ate my homework.']
characters = string.printable  # all printable ASCII characters
token_index = dict(zip(characters, range(1, len(characters) + 1)))  # map each character to a unique index

max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))

for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        results[i, j, index] = 1

print(results)

Character-level encoded output (each used time step has a single 1 at its character's index; NumPy's summarized printout hides those columns):

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
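A quick sanity check on the encoding above (a sketch reusing results and token_index from this block):

print(results.shape)           # (2, 50, 101): 100 printable characters plus the unused index 0
t_index = token_index['t']     # both samples begin with 't'
print(results[0, 0, t_index])  # 1.0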
# Word-level one-hot encoding with Keras's built-in Tokenizer

from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

tokenizer = Tokenizer(num_words=1000)  # create a tokenizer configured to only take into account the 1,000 most common words
tokenizer.fit_on_texts(samples)  # build the word index

sequences = tokenizer.texts_to_sequences(samples)  # turn the strings into lists of integer indices
print(sequences)


one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')  # you can also get the one-hot binary matrix directly
print(one_hot_results)


word_index = tokenizer.word_index  # recover the computed word index

print('Found %s unique tokens.' % len(word_index))

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
Found 9 unique tokens.
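Besides mode='binary', texts_to_matrix also supports 'count', 'freq', and 'tfidf'; a short sketch using the tokenizer fitted above:

count_results = tokenizer.texts_to_matrix(samples, mode='count')  # raw per-sample word counts
tfidf_results = tokenizer.texts_to_matrix(samples, mode='tfidf')  # TF-IDF-weighted counts

print(count_results[0, 1])  # 2.0: 'the' (index 1) appears twice in the first sample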
# Word-level one-hot encoding with the hashing trick

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# We will store our words as vectors of size 1000.
# Note that if you have close to 1000 words (or more)
# you will start seeing many hash collisions, which
# will decrease the accuracy of this encoding method.
dimensionality = 1000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word)) % dimensionality  # hash the word into a "random" index between 0 and dimensionality
        results[i, j, index] = 1
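A rough illustration of why the comment above warns about collisions (a sketch; the 2,000-word vocabulary is hypothetical, and Python's hash() varies between runs):

vocab = ['word%d' % k for k in range(2000)]  # hypothetical vocabulary larger than dimensionality
hashed_indices = {abs(hash(w)) % dimensionality for w in vocab}
print(len(vocab) - len(hashed_indices))  # number of words lost to hash collisions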