import jieba
import jieba.analyse
import codecs
import pandas as pd
import numpy as np
# Load the 2019-05-06 news dump; dtype=str keeps every column textual so later
# string operations on 'content' are safe.
rows=pd.read_csv('/home/kesci/input/stockpredic_15501/20190506_NEWS.csv',header=0,encoding='utf-8',dtype=str)
from string import punctuation
import re
# ASCII punctuation plus full-width CJK punctuation to strip from news text.
# NOTE(review): the literal '\s' here ends up inside a regex character class
# below, where it acts as the whitespace shorthand — so whitespace is stripped
# too. The raw ']' and '\' from string.punctuation happen to form a valid '\]'
# escape inside the class, which is why this pattern works unescaped.
punc = punctuation + u'.,;《》?!“”‘’@#¥%…&×()——+【】{};;●,。&~、|\s::'

# Compile once at module level: re_bs is applied to every row of the frame,
# so per-call recompilation/cache lookup is avoided.
_PUNC_RE = re.compile(r"[{}]+".format(punc))
_DIGIT_RE = re.compile(r"\d")


def re_bs(x):
    """Return *x* as a string with all punctuation, whitespace and digits removed.

    Non-string input is coerced with str() first, matching the original
    behaviour (e.g. NaN becomes the literal text 'nan' minus stripped chars).
    """
    cleaned = _PUNC_RE.sub("", str(x))
    return _DIGIT_RE.sub("", cleaned)
# remove digits and punctuation
# Clean every news item in place (strip punctuation, whitespace and digits).
rows['content'] = rows['content'].apply(re_bs)

# Load the HIT stopword list, one word per line. The context manager closes
# the file handle — the original codecs.open(...).readlines() leaked it.
with codecs.open('/home/kesci/哈工大停用词表.txt', 'r', 'utf-8') as f:
    stopwords = [line.strip() for line in f]
# Global accumulator: one {'word': ..., 'count': 1} record per kept token
# across all rows, consumed by the aggregation step below.
segments = []

# Build the stopword set once — set membership is O(1) per token, instead of
# scanning the stopword list for every word of every article.
_stopword_set = set(stopwords)


def cutjieba(x):
    """Segment *x* with jieba, drop stopwords, record kept tokens globally.

    Side effect: appends a {'word', 'count'} record to `segments` for each
    kept token. Returns the kept tokens joined by single spaces (the original
    built this string but never returned it — dead computation, now fixed;
    the caller below still discards the return value, so behaviour of the
    script is unchanged).
    """
    kept = []
    for word in jieba.cut(x):
        if word not in _stopword_set:
            # record the token for the global frequency table
            segments.append({'word': word, 'count': 1})
            kept.append(word)
    return ' '.join(kept)


rows['content'].apply(cutjieba)  # apply segmentation; fills `segments`
# Aggregate the per-token records into a word -> total count table.
dfSg = pd.DataFrame(segments)
dfWord = dfSg.groupby('word')['count'].sum()
dfWord = pd.DataFrame(dfWord)
# Fix: sort_values returns a new frame — the original discarded it, so the
# table was never actually sorted.
dfWord = dfWord.sort_values('count', ascending=False)
# NOTE(review): to_csv() with no path returns the CSV text and discards it,
# writing nothing. Kept as-is to preserve behaviour — pass a file path here
# if the intent was to persist the frequency table.
dfWord.to_csv()