Python 3 读取 TXT 文件时,如果没有显式指定编码,经常会抛出字符编码错误(例如 UnicodeDecodeError)。下面这种写法就会出现该问题:
# Build doc_list: one filtered token list per line of the corpus file.
# NOTE: open() is called here WITHOUT encoding=, so Python 3 decodes the file
# with the platform's default encoding. This is the failing pattern that the
# surrounding note describes; it is kept as-is for illustration.
# corpus_path, pos, doc_list, seg_to_list and word_filter are defined
# elsewhere (not visible in this snippet).
for line in open(corpus_path, 'r'):
    # Drop leading/trailing whitespace, including the line terminator.
    content = line.strip()
    # presumably word segmentation followed by a POS-based filter — confirm
    # against the helper definitions.
    seg_list = seg_to_list(content, pos)
    filter_list = word_filter(seg_list, pos)
    doc_list.append(filter_list)
这种读取 TXT 的方式没有指定编码,所以一直有问题,此前也一直没有找到解决的方法。改成下面这种显式指定 encoding='utf-8' 的写法就可以了:
# Read the whole file as UTF-8 text and print it line by line.
# Passing encoding='utf-8' explicitly avoids depending on the platform's
# default encoding; the with-statement closes the file when the block exits.
with open('../data/***.txt', 'r', encoding='utf-8') as f:
    # splitlines() removes the line terminators, so print() below emits
    # exactly one newline per row.
    read = f.read().splitlines()

for row in read:
    print(row)
经常遇到 Python 读写文本时的编码问题,标准库里的 codecs 模块可以较好地解决这个问题:
from gensim.corpora import WikiCorpus
import jieba
from langconv import *
import codecs
# Convert a Chinese Wikipedia dump into a plain-text corpus: each item yielded
# by wiki.get_texts() is written as one line of space-separated jieba tokens,
# after conversion with Converter('zh-hans').
zhwiki = 'zhwiki-latest-pages-articles.xml.bz2'

# NOTE(review): the `lemmatize` keyword was removed from WikiCorpus in
# gensim 4.0 — confirm the installed gensim version still accepts it.
wiki = WikiCorpus(zhwiki, lemmatize=False, dictionary={})

# Build the converter once instead of once per token.
# assumes Converter.convert() resets its internal state on every call —
# TODO confirm against langconv.
converter = Converter('zh-hans')

i = 0  # number of articles written so far; stays 0 if the dump yields nothing

# Append mode: re-running the script adds to an existing output file.
# The with-statement guarantees the file is closed even if processing fails
# part-way through the dump.
with codecs.open('../data/zhiwiki.txt', 'a', 'utf-8') as f:
    for i, text in enumerate(wiki.get_texts(), start=1):
        strs = []  # tokens of the current article only
        for sen in text:
            sen = converter.convert(sen)
            strs.extend(str(s) for s in jieba.cut(sen))
        f.write(' '.join(strs) + '\n')
        # Progress report every 200 articles.
        if i % 200 == 0:
            print('save' + str(i) + 'article')