Python + difflib: 比对新闻消息获取热点

        获取热点新闻有很多种方法,比如直接获取新闻网站的热点栏目。不过热点栏目可能是编辑推荐的,具有较大的主观性。稍微复杂一点的方法是提取热点词组:先对每条消息分词,再统计得出出现最频繁的词汇(即热门词典),包含这些热门词汇的消息即为热点新闻。

        此外,还可以通过对比新闻消息的相似度来提取热点。这种方法综合了以上两者的优点:在比对语句相似度的过程中,实际上已经隐式地利用了热门词汇。除了大规模比对消息之外,也可以只对热点栏目的新闻进行相似度比较,从而比较快速有效地生成热点的预览。

        热点的产生最少是二维的,即成为热点必要条件是最少出现两条相似度较高的消息。直接比较消息相似度的情况下,维数越高,耗时越久。

"""
@author: MR.N
@created: 2021/9/3 下午8:46
@file: main_tk
@project: Csdn
@description: None
@blog: https://blog.csdn.net/qq_21264377

"""


import time
import tkinter as tk
from difflib import SequenceMatcher
from tkinter import scrolledtext


# Shared module state. get_hot_news() mutates these containers in place and
# list_news() reads them, so they must never be rebound to new objects.
hot_news = []  # representatives reported by at least two different sources
recent_news = []  # one representative entry per group of similar titles
news_weights = {}  # representative.__hash__() -> number of distinct sources
news_sources = {}  # representative.__hash__() -> list of those sources

# Minimum SequenceMatcher ratio for two titles to count as the same story.
SIMILARITY_THRESHOLD = .5001


# Widget initialisation is omitted here:
# event = tkinter.scrolledtext.ScrolledText(...)


def _news_key(entry):
    """Return the dictionary key used for *entry* in the weight/source tables."""
    # list_news() looks entries up with ``item.__hash__()``; use the very same
    # call here so both functions always agree on the key.
    return entry.__hash__()


def _show_event(event, tag, entry):
    """Replace the widget content with a single ``[tag] title sources`` line."""
    if event is None:
        return
    event.config(state=tk.NORMAL)
    event.delete(0., tk.END)
    event.insert(0.,
                 f'\n  [{tag}] {entry.get_title()} {news_sources.get(_news_key(entry))}\n')
    event.config(state=tk.DISABLED)
    # Short pause after each update, kept from the original implementation.
    time.sleep(.1)


def _register_match(item, representative, event):
    """Credit the source of *item* to *representative* and promote it if hot."""
    key = _news_key(representative)
    source = item.get_source()
    if news_weights.get(key, 0) == 0:
        # First confirmation from a second source: the story becomes "HOT".
        news_weights[key] = 2
        news_sources[key] = [source, representative.get_source()]
        _show_event(event, 'HOT', representative)
    elif source not in news_sources[key]:
        # A further, previously unseen source: the story moves up to "TOP".
        news_weights[key] += 1
        news_sources[key].append(source)
        _show_event(event, 'TOP', representative)
    if news_weights.get(key, 0) > 1 and representative not in hot_news:
        hot_news.append(representative)
        print(representative.get_title(), source, representative.get_source())


def get_hot_news(news_list, event=None, threshold=SIMILARITY_THRESHOLD):
    """Group incoming news by title similarity and track multi-source stories.

    Every entry of ``news_list`` must provide ``get_title()`` and
    ``get_source()``. An entry is compared with the representatives stored in
    ``recent_news``; the first representative whose title similarity reaches
    ``threshold`` is treated as the same story. If the two entries come from
    different sources, the representative's weight and source list are updated
    and it is added to ``hot_news``. Entries that match nothing become new
    representatives.

    Args:
        news_list: Sequence of news entries; ``None`` or an empty sequence is
            accepted and leaves the state untouched.
        event: Optional text widget (``config``/``delete``/``insert``) that
            receives a ``[HOT]``/``[TOP]`` notice whenever a weight changes.
        threshold: Minimum ``SequenceMatcher.ratio()`` for two titles to be
            considered similar.

    Returns:
        None. Results are stored in the module-level containers.
    """
    count = 0 if news_list is None else len(news_list)
    print('[count]', count)
    if count == 0:
        return

    for item in news_list:
        if item in recent_news:
            # The entry (or one comparing equal to it) is already tracked; it
            # only counts when it is reported by a different source.
            for diff in recent_news:
                if item == diff and item.get_source() != diff.get_source():
                    _register_match(item, diff, event)
                    break
            continue

        for diff in recent_news:
            ratio = SequenceMatcher(None, item.get_title(), diff.get_title()).ratio()
            if ratio >= threshold:
                # Only the first similar representative is considered.
                if item.get_source() != diff.get_source():
                    _register_match(item, diff, event)
                break
        else:
            # Nothing similar is known yet: remember it as a new representative.
            recent_news.append(item)

在Tkinter的ScrolledText上列举热门消息:

def list_news(news_list, event=None):
    """Write the hot entries of ``news_list`` into the text widget ``event``.

    The widget is cleared first. Entries whose weight in ``news_weights`` is
    exactly 2 are then inserted with a ``[HOT]`` tag, followed by a second pass
    that inserts entries with a weight above 2 using a ``[TOP]`` tag. Every
    insertion happens at the start of the widget. When ``event`` is ``None``
    nothing is written.
    """
    if event is not None:
        # Start from an empty widget.
        event.config(state=tk.NORMAL)
        event.delete(0., tk.END)
        event.config(state=tk.DISABLED)
        time.sleep(.1)

    # First pass: stories whose source weight is exactly 2.
    for entry in news_list:
        if event is None:
            continue
        if news_weights.get(entry.__hash__(), 0) != 2:
            continue
        event.config(state=tk.NORMAL)
        event.insert(0.,
                     f'\n  [HOT] {entry.get_title()} {news_sources[entry.__hash__()]}\n')
        event.config(state=tk.DISABLED)
        time.sleep(.1)

    # Second pass: stories whose source weight is greater than 2.
    for entry in news_list:
        if event is None:
            continue
        if not news_weights.get(entry.__hash__(), 0) > 2:
            continue
        event.config(state=tk.NORMAL)
        event.insert(0.,
                     f'\n  [TOP] {entry.get_title()} {news_sources[entry.__hash__()]}\n')
        event.config(state=tk.DISABLED)
        time.sleep(.1)

“news_list”中的“item”所指向类的定义包含于hotspot.py,其代码如下:

"""
@author: MR.N
@created: 2021/9/18 22:38
@blog: https://blog.csdn.net/qq_21264377
@description:   none.

"""

__author__ = 'MR.N'


class Hotspot:
    """A single news entry: a title and the source that published it."""

    def __init__(self, title, source=None):
        """Store the headline ``title`` and its optional ``source``."""
        self.title = title
        self.source = source

    # The former ``__del__`` only reset the attributes of an object that was
    # already being destroyed, so it has been removed.

    def __repr__(self):
        """Return a debugging representation showing title and source."""
        return f'{type(self).__name__}(title={self.title!r}, source={self.source!r})'

    def get_title(self):
        """Return the title of this entry."""
        return self.title

    def get_source(self):
        """Return the source of this entry, or ``None`` if it was not given."""
        return self.source

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值