一、数据处理
代码地址:https://github.com/Wang-Anna
(稍后会上传代码)
1、实现中文、英文的分词
2、创建字典,以实现单词和索引之间的转换
1.英文分词:
WORD_SPLIT= r'[,.!?/\':;\")(]'
# Compiled once at module level: avoids re-parsing / cache-lookup of the
# pattern on every call.  The raw string WORD_SPLIT is kept because
# ch_tokenize (and possibly other code) still references it.
_WORD_SPLIT_RE = re.compile(WORD_SPLIT)

def en_tokenize(sentence):
    """Split an English sentence into a list of lower-cased word tokens.

    The sentence is first split on whitespace, then each fragment is split
    again on the punctuation characters in WORD_SPLIT.  Empty strings
    produced by the punctuation split are dropped.

    Args:
        sentence: input string; leading/trailing whitespace is ignored.

    Returns:
        List of lower-cased tokens; [] for an empty/whitespace sentence.
    """
    words = []
    for space_separated_fragment in sentence.strip().split():
        words.extend(_WORD_SPLIT_RE.split(space_separated_fragment))
    return [w.lower() for w in words if w]
2.中文分词:
# Punctuation tokens to drop after segmentation (mostly full-width Chinese
# punctuation).  Two fixes versus the original list:
#   1. The original was missing a comma between r'(' and r')', which fused
#      them into a single '()' element, so neither ASCII paren was filtered.
#   2. A set gives O(1) membership tests and removes the duplicated r'“'.
# The r'' entry is kept so that empty tokens are filtered as before.
stopwords = {r',', r'。', r'?', r'\'', r'“', r'', r'、', r'!', r'…', r'”',
             r'-', r'...', r'(', r')', r';', r'·', r'......', r'(', r')', '\"'}

def ch_tokenize(sentence):
    """Split a Chinese sentence into a list of word tokens.

    Counterpart of en_tokenize: segmentation is done with jieba, ASCII
    punctuation (WORD_SPLIT) is stripped from each token, and tokens that
    are empty or listed in `stopwords` are dropped.
    """
    words = []
    for word in jieba.cut(sentence):
        # The original called re.sub but discarded its return value (strings
        # are immutable, so that line was a no-op).  Keep the result so the
        # ASCII punctuation is actually removed from the token.
        words.append(re.sub(WORD_SPLIT, '', word))
    return [w for w in words if w not in stopwords]
3、创建字典
生成字典文件
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, tokenizer):
    """Build a vocabulary file from a raw-text corpus.

    Every line of `data_path` is tokenized with `tokenizer` and the tokens
    are counted.  The special symbols in _START_VOCAB followed by the
    remaining tokens in descending-frequency order (capped at
    `max_vocabulary_size` entries in total) are written to
    `vocabulary_path`, one token per line.

    Args:
        vocabulary_path: output path of the vocabulary file.
        data_path: path of the utf-8 corpus, one sentence per line.
        max_vocabulary_size: hard cap on the vocabulary length,
            including the _START_VOCAB symbols.
        tokenizer: callable mapping a line of text to a list of tokens.
    """
    # Local import so this snippet stays self-contained; Counter replaces
    # the hand-rolled dict counting below.
    from collections import Counter

    print("Creating vocabulary %s from %s" % (vocabulary_path, data_path))
    counts = Counter()
    with codecs.open(data_path, mode="r", encoding='utf-8') as f:
        for counter, line in enumerate(f, 1):
            if counter % 100000 == 0:
                print(" processing line %d" % counter)
            counts.update(tokenizer(line))
    # Counter.most_common orders by descending count and, for ties, by first
    # appearance — the same order the original stable
    # sorted(vocab, key=vocab.get, reverse=True) produced.
    vocab_list = _START_VOCAB + [w for w, _ in counts.most_common()]
    print('>> Full Vocabulary Size :', len(vocab_list))
    if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
    with codecs.open(vocabulary_path, mode="w", encoding='utf-8') as vocab_file:
        for w in vocab_list:
            vocab_file.write(w + "\n")
从字典文件里读取单词,创建“单词 → 索引”的 dict
def initialize_vocabulary(vocabulary_path):
    """Load a vocabulary file and return a {word: index} dict.

    Each word's index is its 0-based line number in the file produced by
    create_vocabulary.
    """
    with codecs.open(vocabulary_path, mode="r", encoding='utf-8') as vocab_file:
        tokens = [line.strip() for line in vocab_file]
    return {word: index for index, word in enumerate(tokens)}
将句子字符串转成相应的索引序列
def sentence_to_token_ids(sentence, vocabulary, tokenizer):
    """Convert a sentence into its list of token ids.

    Args:
        sentence: raw input string.
        vocabulary: the {word: index} dict returned by initialize_vocabulary.
        tokenizer: callable splitting the sentence into tokens.

    Returns:
        List of ids; tokens absent from the vocabulary map to UNK_ID.
    """
    lookup = vocabulary.get
    return [lookup(token, UNK_ID) for token in tokenizer(sentence)]