# encoding=utf-8
import jieba
import optparse
import re,codecs
def main():
    """Segment a UTF-8 text file with jieba, one output line per input line.

    Command-line options:
        --kunpeng-txt    path of the input text file. It is decoded as
                         UTF-8 and undecodable bytes are ignored.
        --kunpeng-split  path of the output file. It receives, for every
                         input line, the jieba tokens joined by single
                         spaces and terminated by a newline, UTF-8 encoded.

    Before segmentation each line is stripped of surrounding whitespace and
    every character that is not a CJK ideograph in U+4E00..U+9FA5, an ASCII
    letter or an ASCII digit is removed.

    Exits through ``parser.error`` (usage message, exit status 2) when either
    path is missing.
    """
    parser = optparse.OptionParser()
    parser.add_option('--kunpeng-txt', type=str, help='')
    parser.add_option("--kunpeng-split", type=str, default='')
    (args, dummy) = parser.parse_args()

    in_path = args.kunpeng_txt
    out_path = args.kunpeng_split
    # Fail with a usage message instead of an opaque error from open().
    if not in_path:
        parser.error('--kunpeng-txt is required')
    if not out_path:
        parser.error('--kunpeng-split is required')

    # NOTE: the trailing "5" after \u9fa5 is redundant (already covered by
    # 0-9); the pattern is kept unchanged so the filtering stays identical.
    pat_assert = u'[^\u4e00-\u9fa55a-zA-Z0-9]'
    repat_assert = re.compile(pat_assert)

    # The input is opened first so that a bad input path does not truncate an
    # existing output file; both handles are closed even if a line fails.
    with codecs.open(in_path, 'rb', encoding='utf-8', errors='ignore') as src, \
            open(out_path, 'wb') as outfile:
        for line in src:
            cleaned = repat_assert.sub(u'', line.strip())
            words = ' '.join(jieba.cut(cleaned)) + '\n'
            outfile.write(words.encode('utf-8'))
# Run the segmentation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()