获取热点新闻有很多种方法,比如直接抓取新闻网站的热点栏目。不过热点栏目可能是编辑推荐的,具有较大主观性。稍微复杂一点的方法是提取热点词组:先对每条消息分词,再比对统计得出出现最频繁的词汇,组成热门词典,包含热门词典中词汇的消息即是热点新闻。
此外,还可以通过对比新闻消息的相似度来提取热点。这种方法综合了以上两者的优点:在比对语句相似度的过程中,已经隐含地利用了热门词汇。除了大规模比对消息之外,也可以只对热点栏目的新闻进行相似度比较,从而较为快速有效地生成热点的预览。
热点的产生至少是“二维”的,即成为热点的必要条件是至少出现两条相似度较高的消息。在直接比较消息相似度的情况下,维数越高,耗时越久。
"""
@author: MR.N
@created: 2021/9/3 下午8:46
@file: main_tk
@project: Csdn
@description: None
@blog: https://blog.csdn.net/qq_21264377
"""
import time
import tkinter as tk
from difflib import SequenceMatcher
from tkinter import scrolledtext
# Stored entries that have been reported by at least two different sources.
hot_news = []
# One representative item per group of similar titles seen so far.
recent_news = []
# entry.__hash__() -> number of distinct sources that reported the entry.
news_weights = {}
# entry.__hash__() -> list of the sources that reported the entry.
news_sources = {}
# Two titles count as the same story when their SequenceMatcher ratio
# reaches this value.
_SIMILARITY_THRESHOLD = .5001


def _show_headline(event, tag, entry):
    """Replace the widget content with a one-line ``[tag]`` preview of *entry*.

    Does nothing when *event* is None (headless use).
    """
    if event is None:
        return
    # The widget is kept read-only; unlock it only while rewriting the text.
    event.config(state=tk.NORMAL)
    event.delete(0., tk.END)
    event.insert(0.,
                 f'\n [{tag}] {entry.get_title()} {news_sources.get(entry.__hash__())}\n')
    event.config(state=tk.DISABLED)
    time.sleep(.1)


def _record_source(entry, item, event):
    """Credit the source of *item* to the stored *entry*.

    The first cross-source match gives the entry weight 2 and shows it as
    ``HOT``; every further new source adds 1 and shows it as ``TOP``.

    Returns True when *entry* was appended to ``hot_news`` by this call.
    """
    key = entry.__hash__()
    if news_weights.get(key, 0) == 0:
        news_weights[key] = 2
        news_sources[key] = [item.get_source(), entry.get_source()]
        _show_headline(event, 'HOT', entry)
    elif item.get_source() not in news_sources[key]:
        news_weights[key] += 1
        news_sources[key].append(item.get_source())
        _show_headline(event, 'TOP', entry)
    if news_weights.get(key, 0) > 1 and entry not in hot_news:
        hot_news.append(entry)
        return True
    return False


# Initialisation omitted: event = tkinter.scrolledtext.ScrolledText(...)
def get_hot_news(news_list, event=None):
    """Fold *news_list* into the module-level hot-news bookkeeping.

    Each item must provide ``get_title()`` and ``get_source()``. An item whose
    title is similar to an already stored entry but comes from another source
    raises that entry's weight; an item similar to nothing stored becomes a
    new entry in ``recent_news``. Only the first similar stored entry is
    considered for each item.

    Args:
        news_list: Sized collection of news items; ``None`` or an empty
            collection is accepted and leaves the state untouched.
        event: Optional text widget used to preview the latest hit. It must
            support ``config``, ``delete`` and ``insert``.

    Returns:
        None. Results are accumulated in ``hot_news``, ``recent_news``,
        ``news_weights`` and ``news_sources``.
    """
    if not news_list:
        print('[count]', 0)
        return
    print('[count]', len(news_list))

    for item in news_list:
        if item in recent_news:
            # Exact duplicate (by ==) of a stored entry: only a copy coming
            # from a different source adds weight.
            for entry in recent_news:
                if item == entry and item.get_source() != entry.get_source():
                    _record_source(entry, item, event)
                    break
            continue

        # Otherwise look for the first stored entry with a similar title.
        similar = None
        for entry in recent_news:
            matcher = SequenceMatcher(None, item.get_title(), entry.get_title())
            if matcher.ratio() >= _SIMILARITY_THRESHOLD:
                similar = entry
                break

        if similar is None:
            recent_news.append(item)
        elif item.get_source() != similar.get_source():
            if _record_source(similar, item, event):
                print(similar.get_title(), item.get_source(), similar.get_source())
在Tkinter的ScrolledText上列举热门消息:
def list_news(news_list, event=None):
    """Write the weighted entries of *news_list* into the text widget *event*.

    The widget is cleared first. Entries with weight 2 are then inserted as
    ``[HOT]`` lines, followed by a second pass inserting entries with weight
    above 2 as ``[TOP]`` lines. Every insert goes to the start of the widget.
    Without a widget nothing is written.
    """
    has_widget = event is not None

    if has_widget:
        # Unlock the read-only widget just long enough to wipe it.
        event.config(state=tk.NORMAL)
        event.delete(0., tk.END)
        event.config(state=tk.DISABLED)
        time.sleep(.1)

    # First pass: hot entries whose source weight is exactly 2.
    for entry in news_list:
        if not has_widget:
            continue
        if news_weights.get(entry.__hash__(), 0) != 2:
            continue
        event.config(state=tk.NORMAL)
        event.insert(
            0.,
            f'\n [HOT] {entry.get_title()} {news_sources[entry.__hash__()]}\n',
        )
        event.config(state=tk.DISABLED)
        time.sleep(.1)

    # Second pass: hot entries whose source weight is greater than 2.
    for entry in news_list:
        if not has_widget:
            continue
        if news_weights.get(entry.__hash__(), 0) <= 2:
            continue
        event.config(state=tk.NORMAL)
        event.insert(
            0.,
            f'\n [TOP] {entry.get_title()} {news_sources[entry.__hash__()]}\n',
        )
        event.config(state=tk.DISABLED)
        time.sleep(.1)
“news_list”中每个“item”所属的类定义在hotspot.py中,其代码如下:
"""
@author: MR.N
@created: 2021/9/18 22:38
@blog: https://blog.csdn.net/qq_21264377
@description: none.
"""
__author__ = 'MR.N'
class Hotspot:
    """A single news headline together with the site it came from."""

    def __init__(self, title, source=None):
        """Store the headline text and, optionally, its source.

        Args:
            title: Headline text.
            source: Identifier of the site that published it; defaults to None.
        """
        self.title = title
        self.source = source

    # No __del__ finalizer: clearing the attributes of an object that is
    # already being destroyed has no effect.

    def __repr__(self):
        """Return a debugging representation showing title and source."""
        return f'{type(self).__name__}(title={self.title!r}, source={self.source!r})'

    def get_title(self):
        """Return the headline text."""
        return self.title

    def get_source(self):
        """Return the source given at construction (None if not given)."""
        return self.source