# Seq2Seq TensorFlow Implementation: Data Processing

## 1. Data Processing

(The code will be uploaded later.)

1. Tokenize English and Chinese text.
2. Build a vocabulary to convert between words and indices.

1. English tokenization:

```python
import re

# Punctuation on which English tokens are split (and discarded).
WORD_SPLIT = r'[,.!?/\':;\")(]'

def en_tokenize(sentence):
    """Split the sentence on whitespace and WORD_SPLIT into a list of lowercase words."""
    words = []
    for space_separated_fragment in sentence.strip().split():
        words.extend(re.split(WORD_SPLIT, space_separated_fragment))
    return [w.lower() for w in words if w]
```
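A quick sanity check (the sample sentence is my own, not from the original data):

```python
>>> en_tokenize("Hello, world! How are you?")
['hello', 'world', 'how', 'are', 'you']
```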

2. Chinese tokenization:

```python
import re
import jieba

# Chinese and ASCII punctuation to drop after segmentation.
stopwords = {'，', '。', '？', "'", '“', '”', '、', '！', '…', '-', '...',
             '（', '）', '；', '·', '......', '(', ')', '"'}

def ch_tokenize(sentence):
    """Same as en_tokenize, but for Chinese: segment with jieba, then drop punctuation."""
    words = []
    for word in jieba.cut(sentence.strip()):
        # Strip any English punctuation glued to a token, then filter out
        # empty strings and punctuation-only tokens.
        word = re.sub(WORD_SPLIT, '', word)
        if word and word not in stopwords:
            words.append(word)
    return words
```
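Usage looks like this (jieba's segmentation of the sample phrase is what I would expect, though not guaranteed):

```python
>>> ch_tokenize("你好，世界！")
['你好', '世界']
```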

3. Building the vocabulary

```python
import codecs

# Special tokens, following the TensorFlow seq2seq tutorial convention:
# they are prepended to the vocabulary so their ids stay fixed.
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]
PAD_ID, GO_ID, EOS_ID, UNK_ID = 0, 1, 2, 3

def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, tokenizer):
    """Count token frequencies in data_path and write the most frequent
    max_vocabulary_size tokens to vocabulary_path, one per line."""
    print("Creating vocabulary %s from %s" % (vocabulary_path, data_path))
    vocab = {}
    with codecs.open(data_path, mode="r", encoding='utf-8') as f:
        counter = 0
        for line in f:
            counter += 1
            if counter % 100000 == 0:
                print("  processing line %d" % counter)
            for w in tokenizer(line):
                vocab[w] = vocab.get(w, 0) + 1
    # Special tokens first, then words sorted by descending frequency.
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    print('>> Full Vocabulary Size :', len(vocab_list))
    if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
    with codecs.open(vocabulary_path, mode="w", encoding='utf-8') as vocab_file:
        for w in vocab_list:
            vocab_file.write(w + "\n")
```
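To run it end to end, something like this would do (the file names and the 40000 cap are illustrative, not from the post):

```python
# Hypothetical corpus paths; substitute your own training files.
create_vocabulary("vocab40000.en", "train.en", 40000, en_tokenize)
create_vocabulary("vocab40000.zh", "train.zh", 40000, ch_tokenize)
```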

```python
def initialize_vocabulary(vocabulary_path):
    """Load the vocabulary file and return vocab, a dict mapping word -> index."""
    with codecs.open(vocabulary_path, mode="r", encoding='utf-8') as f:
        rev_vocab = [line.strip() for line in f]
    vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    return vocab
```
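Because the special tokens were written first, their indices line up with the id constants defined above (the vocabulary path here is again illustrative):

```python
vocab = initialize_vocabulary("vocab40000.en")
assert vocab["_PAD"] == PAD_ID and vocab["_UNK"] == UNK_ID
```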


```python
def sentence_to_token_ids(sentence, vocabulary, tokenizer):
    """Map a sentence to a list of token ids.

    vocabulary: the vocab dict returned by initialize_vocabulary.
    """
    words = tokenizer(sentence)
    # Words missing from the vocabulary fall back to UNK_ID.
    return [vocabulary.get(w, UNK_ID) for w in words]
```
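Putting the pieces together (a sketch; the path and sample sentence are mine):

```python
vocab = initialize_vocabulary("vocab40000.en")
ids = sentence_to_token_ids("Hello, world!", vocab, en_tokenize)
# ids now holds one integer per token; out-of-vocabulary words map to UNK_ID (3).
```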
