文本预处理
文本预处理基本过程
- 读入文本
- 分词
- 建立字典,将每个词映射到一个唯一的索引
- 将文本从词的序列转换为索引的序列,方便输入模型
1. 读入文本
import collections
import re
def read_time_machine(path='./Document.txt'):
    """Read a text file and return a list of cleaned lines.

    Each line is lower-cased, stripped of surrounding whitespace, and every
    run of non-letter characters is collapsed to a single space, so the
    result contains only ``[a-z]`` and spaces.

    Args:
        path: file to read (defaults to the original hard-coded path).

    Returns:
        list[str]: one cleaned string per input line.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        return [re.sub('[^a-z]+', ' ', line.strip().lower()) for line in f]
# Load the corpus: one cleaned, lower-cased string per line of the file.
lines = read_time_machine()
print(len(lines))  # number of lines read
print(lines)  # inspect the cleaned text
2. 分词
将句子分成若干个词
def tokenize(sentences, token='word'):
    """Split each sentence into word tokens or character tokens.

    Args:
        sentences: iterable of strings.
        token: 'word' to split on single spaces, 'char' to split into
            individual characters.

    Returns:
        list[list[str]]: one token list per sentence.

    Raises:
        ValueError: if ``token`` is neither 'word' nor 'char'.
    """
    if token == 'word':
        # split(' ') keeps the original behaviour: consecutive spaces
        # produce empty-string tokens.
        return [sentence.split(' ') for sentence in sentences]
    if token == 'char':
        return [list(sentence) for sentence in sentences]
    # Fail loudly instead of printing a (misspelled) message and silently
    # returning None, which would surface later as a confusing TypeError.
    raise ValueError('ERROR: unknown token type ' + token)
# Tokenise every cleaned line into words (the default token type).
tokens = tokenize(lines)
print(tokens)
3. 建立词典
class Vocab(object):
    """Vocabulary mapping between tokens and integer indices.

    Tokens are indexed in order of first appearance in the frequency list;
    special tokens occupy the lowest indices when enabled.
    """
    def __init__(self, tokens, min_freq=0, use_special_tokens=True):
        """Build the vocabulary from tokenised sentences.

        Args:
            tokens: list of token lists (one per sentence).
            min_freq: drop tokens that occur fewer than this many times.
            use_special_tokens: reserve indices 0-3 for pad/bos/eos/unk;
                otherwise only 'unk' (index 0) is reserved.
        """
        counter = count_corpus(tokens)  # token -> frequency
        self.token_freqs = list(counter.items())
        self.idx_to_token = []  # index -> token
        if use_special_tokens:
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['pad', 'bos', 'eos', 'unk']
        else:
            self.unk = 0
            # Bug fix: the original wrote to a misspelled attribute
            # (idx_to_tokens), so 'unk' was never actually registered.
            self.idx_to_token += ['unk']
        # Track membership in a set: O(1) lookups instead of the original
        # O(n) list scan per token (quadratic overall).
        seen = set(self.idx_to_token)
        for token, freq in self.token_freqs:
            if freq >= min_freq and token not in seen:
                self.idx_to_token.append(token)
                seen.add(token)
        # token -> index (inverse mapping)
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        """Return the vocabulary size (special tokens included)."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token (or nested list/tuple of tokens) to indices.

        Unknown tokens map to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index (or list/tuple of indices) back to tokens.

        Bug fix: the original tested the undefined name ``tokens`` and read
        the misspelled attribute ``idx_to_tokens``, raising NameError /
        AttributeError on every call.
        """
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
def count_corpus(sentences):
    """Count how often each token occurs across all tokenised sentences.

    Args:
        sentences: list of token lists.

    Returns:
        collections.Counter: token -> occurrence count.
    """
    counter = collections.Counter()
    for sentence in sentences:
        counter.update(sentence)
    return counter
4. 将文本中的词转换为索引
# Build the vocabulary and convert the first few tokenised lines to indices.
# Bug fix: the original used `vocab` below without ever creating it (NameError).
vocab = Vocab(tokens)
for i in range(3):
    print('word : ', tokens[i])
    # vocab[...] maps each token via __getitem__, so out-of-vocabulary
    # tokens fall back to the 'unk' index instead of raising KeyError
    # (as direct vocab.token_to_idx[token] access would).
    print('indices : ', vocab[tokens[i]])