import os
from collections import Counter
from itertools import chain

import jieba
import pandas as pd
from gensim import corpora

# Return a list of token lists: one flattened, stopword-filtered list per input file.
def get_list(root):
    # quoting=3 (csv.QUOTE_NONE): treat quote characters literally
    stopwords = pd.read_csv("C:/stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    stopwords = set(stopwords['stopword'].values)  # set for fast membership tests
    paths = []
    try:
        dirs = os.listdir(root)
        print(dirs)
        for dirx in dirs:
            paths.append(root + '/' + dirx)
            print(root + '/' + dirx)
    except OSError as e:
        print('error:', e)
    all_list = []
    for path in paths:
        df = pd.read_csv(path, encoding='gbk')
        df = df.dropna()  # drop rows with missing values
        content = df.content.values.tolist()
        sentence = []
        # segment each line into words
        for line in content:
            try:
                segs = jieba.lcut(line)
                segs = [x for x in segs if len(x) > 1]          # drop single-character tokens
                segs = [x for x in segs if x not in stopwords]  # drop stopwords
                sentence.append(segs)
            except Exception:
                print(line)
                continue
        sentence = list(chain.from_iterable(sentence))  # flatten into a 1-D token list
        all_list.append(sentence)
    return all_list
# Write one file per document and return the bag-of-words vectors.
# Output line format: word,index,frequency
def write(all_list):
    root = "C:/NLP_project/NLP_project/result/"
    corpus = set(chain.from_iterable(all_list))
    print(len(corpus))
    corpus_dict = dict(zip(corpus, range(len(corpus))))
    vector = []
    for i, sentence in enumerate(all_list):  # was iterating the global `sentences`
        line = root + str(i) + '.txt'
        counts = Counter(sentence)  # count tokens once, instead of list.count per vocabulary word
        vec = []
        with open(line, 'w', encoding='utf-8') as f:  # `with` guarantees the file is closed
            for key, idx in corpus_dict.items():
                freq = counts[key]  # 0 for words absent from this document
                f.write(key + ',' + str(idx) + ',' + str(freq) + '\n')
                vec.append((idx, freq))
        vec.sort(key=lambda x: x[0])  # sort by word index
        vector.append(vec)
    return vector
sentences = get_list("C:/NLP_project/NLP_project/train")
vec = write(sentences)
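
# The `corpora` import above is otherwise unused, which suggests a gensim
# bag-of-words may have been the eventual goal. A minimal sketch of the
# equivalent construction (an assumption, not part of the original pipeline):
# Dictionary assigns the word ids, and doc2bow yields sparse (id, frequency)
# pairs. Unlike write() above, it omits zero-frequency words.
dictionary = corpora.Dictionary(sentences)             # word -> id mapping
bow = [dictionary.doc2bow(doc) for doc in sentences]   # one sparse vector per file
print(bow[0][:10])  # first ten (id, frequency) pairs of the first document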