import jieba
import xlrd
import jieba.analyse
def stopwordslist(filepath):
    """Read the stopword file (one word per line, UTF-8) into a list."""
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords
def fenci(content):
    """Segment the text in the first column of the first sheet, keeping non-stopwords separated by spaces."""
    table = content.sheets()[0]
    nrows = table.nrows  # number of rows
    final = ""
    for row1 in range(1, nrows):  # row 0 is the header
        cell = table.cell(row1, 0).value
        words = jieba.cut(cell)
        for seg in words:
            if seg not in stopwords and len(seg) > 0:  # stopwords is the module-level list loaded below
                final += seg + " "
        final += '\n'
        # print(row1, final)
    return final
jieba.load_userdict("C:\\Users\\Administrator\\Desktop\\userdic.txt")  # load the custom user dictionary (UTF-8 encoded)
stopwords = stopwordslist("C:\\Users\\Administrator\\Desktop\\stop.txt")  # load the stopword list
content = xlrd.open_workbook("C:\\Users\\Administrator\\Desktop\\zhaopin_data.xlsx")  # open the data workbook
final = fenci(content)
# print(final)
keywords = jieba.analyse.extract_tags(final, topK=200, withWeight=True, allowPOS=())
# print(keywords)
for item in keywords:
    # if item[0] in ("SQL", "Python", "SAS"):
    print(item[0], item[1])  # print each keyword and its TF-IDF weight
# Based on the topK words printed above, you can pick a few more to add to the stopword list.
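
One caveat worth flagging: xlrd 2.0 and later only reads legacy .xls files, so opening zhaopin_data.xlsx may fail unless an older xlrd (for example 1.2.0) is installed. A minimal alternative sketch, assuming openpyxl is available and the same layout (header in row 1, text in column A), would read the column like this; the helper name read_first_column is hypothetical, not part of the original script:

from openpyxl import load_workbook

def read_first_column(path):
    # read-only mode streams rows instead of loading the whole workbook into memory
    ws = load_workbook(path, read_only=True).worksheets[0]
    # min_row=2 skips the header row; max_col=1 keeps only column A; drop empty cells
    return [row[0] for row in ws.iter_rows(min_row=2, max_col=1, values_only=True) if row[0]]

The returned list of strings could then be segmented with jieba.cut in the same way as fenci does above.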