# Import the required packages
import numpy as np
import pandas as pd
import jieba
import jieba.analyse
import codecs
import os
import re
# Set the current working directory so the relative data path below resolves.
os.chdir(r'文件路径1')
# Allow up to 500 characters per cell when pandas prints a frame. The full
# option name is used because shorthand keys are resolved by pattern matching
# and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 500)
# Load the data; every column is requested as text.
rows = pd.read_excel('某某.xlsx', dtype=str)

# Extract up to 50 TextRank keywords from the fifth column (position 4) of
# every row, keeping only the POS tags ns/n/vn/v, and record one
# {'word', 'count'} entry per extracted keyword.
segments = []
for content in rows.iloc[:, 4]:
    # Empty cells are read as missing values rather than strings, and jieba
    # cannot segment them, so skip those rows.
    if pd.isna(content):
        continue
    words = jieba.analyse.textrank(
        content,
        topK=50,
        withWeight=False,
        allowPOS=('ns', 'n', 'vn', 'v'),
    )
    segments.extend({'word': word, 'count': 1} for word in words)

# Explicit columns keep the frame well-formed (and the groupby below valid)
# even when no keyword was extracted at all.
dfSg = pd.DataFrame(segments, columns=['word', 'count'])

# Word-frequency statistics: total count per keyword across all rows.
dfWord = dfSg.groupby('word')['count'].sum()