import chardet
if __name__ == '__main__':
    # Demo script (Python 3): segment a Chinese sentence with jieba, drop
    # stopwords, and write the remaining tokens to test.txt, space-separated.
    #
    # NOTE(review): `file2file` and `jieba` are not imported in the visible
    # part of this file -- confirm they are brought into scope elsewhere.
    f = file2file()
    s = '中国是个好地方,我住在这里。'

    # Flatten the stopword rows into a single set for O(1) membership tests.
    # Assumes readtxt returns an iterable of token lists (the original
    # flattened it with sum(rows, [])) -- TODO confirm against file2file.
    stopwords = set().union(*f.readtxt('../data/HITstopwords.txt'))

    # Inspect the character set of s. chardet.detect only accepts bytes, so
    # the text has to be encoded before detection.
    s_charset = chardet.detect(s.encode('utf-8'))

    # Segment the sentence with jieba.
    cut = jieba.lcut(s)

    # Calling chardet.detect(cut[0]) directly fails for the same reason as
    # above: the tokens are text, not bytes. No per-token encode step is
    # needed for filtering as long as the stopwords are text as well
    # (presumably true for what readtxt returns -- verify).

    # Remove stopwords.
    cut__stop_data = [word for word in cut if word not in stopwords]

    # Write the result to disk. The context manager guarantees the handle is
    # closed, and the explicit encoding keeps the Chinese output independent
    # of the platform's default locale.
    with open('test.txt', 'w', encoding='utf-8') as out:
        out.write(' '.join(cut__stop_data))

    print('------------------Run over-----------------')