import jieba

class fenlei:
    def __init__(self, kwpath="keyword.txt", stpath="stop.txt", top=2):
        self.cipin = {}   # word -> frequency across all keywords
        self.fenci = {}   # keyword -> its segmented word list
        # Load the stop-word list (one word per line, GBK-encoded).
        self.stop = [line.strip() for line in open(stpath, encoding="gbk")]
        # Load the keyword list (one keyword per line; GBK assumed, as for the stop list).
        self.keyword = [line.strip() for line in open(kwpath, encoding="gbk")]
        self.index = {}    # inverted index: index term -> keywords under it
        self.result = {}   # index term -> number of keywords under it
        self.top = top

    def delete_stops(self, lists):  # remove stop words
        return list(set(lists) - set(self.stop))
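
    # Hypothetical example: with "的" in the stop list,
    # delete_stops(["租房", "的", "租房"]) returns ["租房"]. Note that the
    # set subtraction also deduplicates the words and discards their order;
    # that is harmless here, because run() only uses word frequencies,
    # never word positions.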

    def sort_dict(self, dicts):  # sort a dict's keys by value, descending
        return [k for v, k in sorted(((v, k) for k, v in dicts.items()), reverse=True)]
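
    # For example (hypothetical values): sort_dict({"租房": 3, "北京": 1})
    # returns ["租房", "北京"]. Ties on the value are broken by comparing
    # the keys in reverse order, since the (value, key) tuples sort as a pair.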

    def count_cipin(self, lists):  # accumulate word frequencies
        for word in lists:
            self.cipin[word] = self.cipin.get(word, 0) + 1
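
    # Counts accumulate across calls: count_cipin(["租房"]) followed by
    # count_cipin(["租房", "北京"]) leaves self.cipin == {"租房": 2, "北京": 1}
    # (hypothetical words, assuming self.cipin started empty).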

    def run(self):  # main pipeline
        for kw in self.keyword:            # loop over the keyword list
            cut = jieba.cut(kw)            # segment the keyword with jieba
            cut = self.delete_stops(cut)   # drop stop words
            self.count_cipin(cut)          # update the word frequencies
            self.fenci[kw] = cut           # cache the segmentation so it is not redone
        for kw in self.fenci:              # loop over the cached segmentations
            index = ''
            index_dict = {}
            for word in self.fenci[kw]:    # for each segmented word
                index_dict[word] = self.cipin[word]   # look up its frequency
            index_list = self.sort_dict(index_dict)   # order the words by frequency
            for word in index_list[:self.top]:   # take the `top` most frequent words
                index += word                    # concatenate them into the index term
            if self.index.get(index):      # build the inverted index
                self.index[index].append(kw)
            else:
                self.index[index] = [kw]
        for term in self.index:            # count the keywords under each index term
            self.result[term] = len(self.index[term])
        lists = self.sort_dict(self.result)   # order the index terms by keyword count
        for index in lists:                # print keywords grouped by index term
            for kw in self.index[index]:
                print(index, "---", kw)
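
# Worked trace (hypothetical data): with keywords "北京租房" and "租房价格",
# stop word "价格", and top=2, and assuming jieba segments them as
# ["北京", "租房"] and ["租房", "价格"], the frequencies after stop-word
# removal are {"租房": 2, "北京": 1}. The index terms become "租房北京" and
# "租房", so the two keywords end up in different groups.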
###############################################################
# a = fenlei(kwpath, stpath, top) instantiates the classifier;
# it takes three parameters:
#   kwpath: path to the keyword file, default "keyword.txt"
#   stpath: path to the stop-word file, default "stop.txt"
#   top:    how many words make up each index term; more words
#           give a finer-grained grouping
# run() is the main pipeline method; call it to start
# classifying.
###############################################################
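
# A minimal end-to-end sketch. The file contents below are hypothetical,
# written here only so the example is self-contained; in real use the two
# input files would already exist on disk.
def demo():
    with open("keyword.txt", "w", encoding="gbk") as f:
        f.write("北京租房\n租房价格\n上海租房\n")
    with open("stop.txt", "w", encoding="gbk") as f:
        f.write("价格\n")
    fenlei(kwpath="keyword.txt", stpath="stop.txt", top=2).run()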
if __name__ == "__main__":
    fenlei().run()   # run() prints the groups and returns None, so no assignment