Extracting article keywords from a link: a Python implementation

While scrolling through the news today, I wondered whether a program could quickly summarize an article's key points for me, so that I could get the gist just from the keywords. No sooner said than done. The script below fetches an article from Toutiao, strips the HTML, segments the text with jieba, scores the terms with TF-IDF, and renders the top keywords as a word cloud.

# -*- coding: utf-8 -*-
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import requests
def model(s):
    # Score terms with TF-IDF and keep the 30 strongest features.
    # The token pattern keeps only tokens of two or more characters,
    # so single-character words are dropped.
    tf = TfidfVectorizer(stop_words=None, token_pattern=r'(?u)\b\w\w+\b', max_features=30)
    weight = tf.fit_transform(s).toarray()
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the current replacement.
    word = tf.get_feature_names_out()
    # Only one document is passed in, so weight has a single row:
    # map each feature name to its TF-IDF weight.
    word_weight = {word[j]: weight[0][j] for j in range(len(word))}
    print(word_weight)
    return word_weight
def showImg(weight):
    # font_path must point to a font with CJK glyphs (e.g. SimHei);
    # otherwise the Chinese words render as empty boxes.
    word_cloud = WordCloud(font_path='./SimHei.ttf', background_color='white', max_font_size=70)
    word_cloud.fit_words(weight)
    plt.imshow(word_cloud)
    plt.xticks([])
    plt.yticks([])
    plt.show()
def load_text():
    article = load_url_text()
    # Alternatively, read the article from a local file:
    # with open('./article.txt', encoding="utf-8") as f:
    #     article = f.read()
    article = article.strip()
    # Remove digits, punctuation, and leftover newlines.
    article = re.sub(r"[0-9,。:.、·—“”\n]", "", article)
    # Segment in precise mode and join the tokens with spaces so that
    # TfidfVectorizer can split the text on word boundaries.
    seg_list = jieba.cut(article, cut_all=False)
    return ' '.join(seg_list)
def load_url_text():
    # The cookie and User-Agent were copied from a browser session so that
    # Toutiao serves the full article rather than an anti-bot page.
    headers = {
        "cookie": "s_v_web_id=verify_l3paknge_fo4iQ6FK_Sl6o_4gXi_Bq1M_QsNhnzjQHYHr; local_city_cache=%E5%8E%A6%E9%97%A8; csrftoken=50ebce478fdb72f8e18eff8fdeaf6727; _tea_utm_cache_24=undefined; tt_webid=7091827241689515551; MONITOR_WEB_ID=2897349e-681b-4319-9b25-f56e05444364; ttwid=1%7CkKdmscTcdpXdUa43CSJOJKkbwJHkrqqnvLSfqmnr92Y%7C1653720989%7C4a2c329e498e864641ddb482670e608110c9461cc101bcdb46a10a586db340be; tt_scid=-NPtai0Iso3HxyvBeDq1HsEGXtheapKHGt1n1xxQOs41rJHZ4Cba1FZTcbRZm8PN2fbb",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

    url = "https://www.toutiao.com/article/7102612270106083881/?log_from=96bbf32ce5e7c_1653720623948"
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'

    # Strip HTML tags, then drop digits, Latin letters, and punctuation,
    # leaving only the Chinese article text.
    result = re.sub('<[^<]+?>', '', response.text).replace('\n', '').strip()
    result = re.sub(r'[0-9A-Za-z,。:.、·—“”!&!%\]\[{}_]', "", result)

    print(result)
    return result
if __name__ == '__main__':
    tmp = load_text()        # fetch, clean, and segment the article
    weight = model([tmp])    # TF-IDF weights for the top terms
    showImg(weight)          # render the keyword word cloud

[Figure: word cloud of the extracted article keywords]
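
For comparison, jieba itself ships a TF-IDF keyword extractor, jieba.analyse.extract_tags, which works on the raw (unsegmented) text and uses a bundled IDF table, so no scikit-learn vectorizer or space-joining step is needed. Below is a minimal sketch: keywords_with_jieba is a hypothetical helper name, top_k=30 mirrors the max_features=30 setting above, and it reuses the load_url_text and showImg functions defined earlier.

import jieba.analyse

def keywords_with_jieba(text, top_k=30):
    # extract_tags runs TF-IDF against jieba's bundled IDF table;
    # withWeight=True returns (word, weight) pairs, which dict() turns
    # into the mapping that WordCloud.fit_words expects.
    return dict(jieba.analyse.extract_tags(text, topK=top_k, withWeight=True))

article = load_url_text()                 # same fetch-and-clean helper as above
showImg(keywords_with_jieba(article))     # same word-cloud rendering as above

The absolute weights differ from the scikit-learn version, but fit_words only uses relative weights, so the resulting cloud should look similar.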
