山东大学舆情分析系统算法开发进度展示
本次完成了基础的分词模块、分词统计模块的开发
分词模块使用了pkuseg包进行分词处理,并使用了保留字、停用词、无关词等做数据的清洗。停用词使用了网络上若干停用词包的整合,保留字和无关词为手动添加,后续考虑将加入保留字和无关词数据库,以便管理员增删。
目前存在分词过程耗费时间较长,分词后未被剔除的无关词较多,统计功能不完善等问题。
后附代码。
MyNLP.py
import pkuseg
# User dictionary: domain terms the segmenter must keep as single tokens.
lexicon = ["学伴", "学伴计划", "深表歉意", "不负", "不严", "不存在"]
# POS tags to keep: nouns, verbs, adjectives ('d' adverbs currently excluded).
tags = ['n', 'v', 'a']


def _load_words(path):
    """Read one word per line from `path` and return them as a list.

    Uses rstrip("\n") instead of line[:-1]: the old slicing chopped the
    last character of the final word whenever the file had no trailing
    newline. `with` guarantees the handle is closed even on error.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]


# TODO(review): machine-specific absolute paths; move stop/irrelevant word
# lists into a database so administrators can edit them (see progress notes).
stopwords = _load_words("C:\\Users\\PC\\.pkuseg\\stopwords.txt")
irrelevantwords = _load_words("C:\\Users\\PC\\.pkuseg\\IrrelevantWords.txt")
# Info-keyword record: one scraped item (link, text, time, source) whose
# keywords are extracted immediately on construction.
class Info_kw:
    # Shared segmenter, created lazily on first use. The original built a
    # new pkuseg.pkuseg(...) inside every NLP() call, reloading the model
    # per instance — the main cause of the slow segmentation noted above.
    _seg = None

    def __init__(self, href, text, time, _from):
        self.href = href    # link to the original article
        self.text = text    # raw text to segment
        self.time = time    # publication time
        self._from = _from  # source/site the item came from
        self.KW = []        # extracted keywords, filled by NLP()
        self.NLP()

    @classmethod
    def _get_seg(cls):
        """Return the process-wide pkuseg segmenter, building it once."""
        if cls._seg is None:
            cls._seg = pkuseg.pkuseg(model_name="news", user_dict=lexicon,
                                     postag=True)
        return cls._seg

    def NLP(self):
        """Segment self.text and fill self.KW.

        Keeps only words whose POS tag is in `tags`, deduplicated in
        first-seen order, then drops stop words and irrelevant words.
        """
        results = self._get_seg().cut(self.text)
        # Deduplicate with a set (the old `not in list` check was O(n^2))
        # while preserving first-occurrence order in `candidates`.
        seen = set()
        candidates = []
        for word, tag in results:
            if word not in seen and tag in tags:
                seen.add(word)
                candidates.append(word)
        # Filter out stop words and irrelevant words.
        for word in candidates:
            if word not in stopwords and word not in irrelevantwords:
                self.KW.append(word)

    def getKW(self):
        return self.KW
WordsCount.py
import MyNLP as NLP
def InfoSet_Comput(data):
    """Extract keywords from raw rows and tally their frequencies.

    data: iterable of rows [href, text, time, _from].
    Prints and returns the (keyword, count) pairs sorted by count,
    highest first. (The original discarded the result after printing;
    returning it is backward-compatible and lets callers reuse it.)
    """
    # Info_kw segments and filters each row's text in its constructor.
    infoSet = [NLP.Info_kw(row[0], row[1], row[2], row[3]) for row in data]

    # Tally keyword occurrences across every item.
    counts = {}
    for info in infoSet:
        for kw in info.KW:
            counts[kw] = counts.get(kw, 0) + 1

    # Stable sort by frequency, descending; ties keep first-seen order.
    info_count = sorted(counts.items(), key=lambda item: item[1],
                        reverse=True)
    print(info_count)
    return info_count
Count_Test.py
import WordsCount as WC
path = "F:\\VS_Projects\\PythonApplication1\\data.csv"
def get_data(file_path=None):
    """Read a comma-separated data file and return a list of field lists.

    file_path: path to read; defaults to the module-level `path` constant,
    so existing `get_data()` callers are unaffected.

    Fixes: the original `line.split(",")` left the trailing "\n" on each
    row's last field; it is stripped here. `with` closes the file even if
    reading raises.
    NOTE(review): naive split — fields containing commas or quotes would
    need the csv module; assumed absent in this data. Encoding is left as
    the platform default to match the original behavior — confirm whether
    data.csv is UTF-8 or GBK and pass encoding= explicitly.
    """
    if file_path is None:
        file_path = path  # module-level default data file
    data = []
    with open(file_path, "r") as f:
        for line in f:
            data.append(line.rstrip("\n").split(","))
    return data
WC.InfoSet_Comput(get_data())