文本预处理;语言模型;循环神经网络
2.11-2.14
一、文本预处理
(1)读入文本
import collections
import re
#逐行读取文本:去除首尾空白、转为小写,并将连续的非字母字符替换为一个空格
def read_time_machine(path='/home/kesci/input/timemachine7163/timemachine.txt'):
    """Read a text file and return its lines reduced to lowercase letters.

    Each line is stripped of surrounding whitespace, lowercased, and every
    run of characters other than ``a``-``z`` is replaced by a single space.

    Args:
        path: Location of the text file. Defaults to the corpus path that
            was previously hard-coded, so existing no-argument calls behave
            as before.

    Returns:
        A list with one normalized string per line of the file; blank lines
        become empty strings.
    """
    # Compile once so the pattern is not looked up again for every line.
    non_letters = re.compile(r'[^a-z]+')
    # Explicit encoding keeps the result independent of the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        return [non_letters.sub(' ', line.strip().lower()) for line in f]
# Load the normalized corpus and report how many lines were read.
lines = read_time_machine()
print('# sentences %d' % (len(lines),))
(2)分词
def tokenize(sentences, token='word'):
    """Split sentences into word or char tokens.

    Args:
        sentences: Iterable of strings, one per sentence/line.
        token: ``'word'`` splits each sentence on single spaces;
            ``'char'`` splits each sentence into its characters.

    Returns:
        A list containing one list of tokens per input sentence.

    Raises:
        ValueError: If ``token`` is neither ``'word'`` nor ``'char'``.
    """
    if token == 'word':
        # split(' ') is kept on purpose: an empty sentence yields [''] and
        # consecutive spaces yield empty tokens, exactly as before.
        return [sentence.split(' ') for sentence in sentences]
    if token == 'char':
        return [list(sentence) for sentence in sentences]
    # Fail loudly instead of printing and returning None, which only
    # surfaced later as an unrelated error in the caller.
    raise ValueError('unknown token type: %r' % (token,))
# Tokenize the corpus by word and preview the first two token lists
# (the bare expression only displays a value in a notebook/REPL).
tokens = tokenize(lines, token='word')
tokens[:2]
(3)建立字典
class Vocab(object)