import collections
import re
def read_time_machine(path='/home/kesci/input/timemachine7163/timemachine.txt'):
    """Load a text corpus and return its lines in normalised form.

    Every line is stripped of surrounding whitespace, lower-cased, and each
    run of characters outside ``a-z`` is collapsed into a single space.

    Args:
        path: Location of the text file to read. Defaults to the Time
            Machine corpus path used by this script.

    Returns:
        A list with one normalised string per line of the file. Blank
        lines are kept as empty strings.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        return [re.sub(r'[^a-z]+', ' ', line.strip().lower()) for line in f]
# Load the normalised corpus and report how many lines were read.
lines = read_time_machine()
print('# sentences %d' % len(lines))
# 2. Tokenization (分词)
def tokenize(sentences, token='word'):
    """Split sentences into word or char tokens.

    Args:
        sentences: Iterable of strings to tokenize.
        token: ``'word'`` to split each sentence on single spaces, or
            ``'char'`` to split each sentence into individual characters.

    Returns:
        A list containing one token list per sentence. For any other
        ``token`` value an error message is printed and ``None`` is
        returned.
    """
    if token == 'word':
        # split(' ') (not split()) keeps empty strings for blank sentences
        # and for consecutive spaces.
        return [sentence.split(' ') for sentence in sentences]
    if token == 'char':
        return [list(sentence) for sentence in sentences]
    print('ERROR: unkown token type ' + token)
    return None
# Tokenize the corpus using the default (word-level) granularity.
tokens = tokenize(lines)
# Notebook-style preview of the first two tokenized lines.
tokens[:2]
# 3. Building the vocabulary (建立字典)
class Vocab(object):
    """Bidirectional mapping between tokens and integer indices.

    Attributes set by the constructor:
        token_freqs: list of ``(token, count)`` pairs from the corpus.
        idx_to_token: list where position ``i`` holds the token for index ``i``.
        token_to_idx: dict mapping each token to its index.
        unk: index returned for tokens that are not in the vocabulary.
        pad, bos, eos: only set when ``use_special_tokens`` is true.
    """

    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        """Build the vocabulary from tokenized sentences.

        Args:
            tokens: Iterable of token lists (one list per sentence).
            min_freq: Tokens occurring fewer than this many times are
                left out of the vocabulary.
            use_special_tokens: When true, reserve indices 0-3 for
                padding, begin-of-sentence, end-of-sentence and unknown;
                otherwise only index 0 is reserved, for unknown.
        """
        counter = count_corpus(tokens)
        self.token_freqs = list(counter.items())

        # NOTE(review): the reserved entries below are empty strings. They
        # look like angle-bracket placeholder names that were lost when this
        # code was extracted; confirm the intended literals. As written, all
        # reserved entries share the key '' in token_to_idx (last index wins).
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            reserved = ['', '', '', '']
        else:
            self.unk = 0
            reserved = ['']

        # Corpus tokens follow the reserved slots; a corpus token equal to a
        # reserved entry is skipped so it is not added a second time.
        kept = [token for token, freq in self.token_freqs
                if freq >= min_freq and token not in reserved]
        self.idx_to_token = reserved + kept

        self.token_to_idx = {
            token: idx for idx, token in enumerate(self.idx_to_token)
        }

    def __len__(self):
        """Return the number of index slots, reserved entries included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens not in the vocabulary map to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a flat list/tuple of indices, back to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]


def count_corpus(sentences):
    """Return a ``collections.Counter`` of token occurrences.

    Args:
        sentences: Iterable of token lists; it is flattened before counting.
    """
    tokens = [tk for st in sentences for tk in st]
    return collections.Counter(tokens)
# Build the vocabulary from the tokenized corpus and show its first ten
# (token, index) pairs.
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:10])
# 4. Converting tokens to indices (将词转为索引)
# Show two sample lines (indices 8 and 9) next to their index encoding.
for i in range(8, 10):
    print('words:', tokens[i])
    print('indices:', vocab[tokens[i]])
# 5. Tokenizing with existing tools (用现有工具进行分词)
# Tokenize the same sample sentence with two third-party tokenizers.
text = "Mr. Chen doesn't agree with my suggestion."

# spaCy: load the small English pipeline and print the token texts.
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
print([token.text for token in doc])

# NLTK: add the local data directory to NLTK's search path before
# calling word_tokenize.
from nltk.tokenize import word_tokenize
from nltk import data
data.path.append('/home/kesci/input/nltk_data3784/nltk_data')
print(word_tokenize(text))