关于通过链接提取文章关键字的实现 - Python

最新推荐文章于 2022-11-22 11:31:57 发布

东哥爱编程

最新推荐文章于 2022-11-22 11:31:57 发布

阅读量326

点赞数

分类专栏：机器学习 python 文章标签： python 开发语言

本文链接：https://blog.csdn.net/monk96/article/details/125018678

版权

python 同时被 2 个专栏收录

50 篇文章 2 订阅

订阅专栏

机器学习

34 篇文章 0 订阅

订阅专栏

今天刷新闻的时候，我想到：能不能有个软件快速帮我总结新闻的要点，让我直接通过关键字就能大致了解文章的内容？说干就干。

# -*- coding: utf-8 -*-
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import requests
from bs4 import BeautifulSoup
def model(s):
    """Compute TF-IDF weights for the most frequent terms of a corpus.

    Args:
        s: Iterable of documents. Each document is a string whose tokens
            are already separated by whitespace; tokens shorter than two
            word characters are ignored by the token pattern.

    Returns:
        dict mapping each retained term (at most 30) to its TF-IDF weight.
        When several documents are passed, the weights of the last
        document overwrite those of earlier ones.

    Side effects:
        Prints the raw weight matrix and the resulting dict to stdout.
    """
    tf = TfidfVectorizer(stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', max_features=30)
    weight = tf.fit_transform(s).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(tf, 'get_feature_names_out'):
        word = tf.get_feature_names_out()
    else:
        word = tf.get_feature_names()

    print(weight)

    # One row per document; later rows overwrite earlier ones so the
    # result reflects the last document (same as the original behaviour).
    scores = {}
    for row in weight:
        scores.update(zip(word, row))

    print(scores)
    return scores
def showImg(weight):
    """Draw a word cloud from term weights and show it in a window.

    Args:
        weight: Mapping of term -> weight, passed to ``WordCloud.fit_words``.

    The font is loaded from ``./SimHei.ttf``, resolved against the current
    working directory. The call blocks in ``plt.show()`` until the figure
    is closed (when an interactive backend is in use).
    """
    cloud = WordCloud(
        font_path='./SimHei.ttf',
        background_color='white',
        max_font_size=70,
    )
    cloud.fit_words(weight)

    plt.imshow(cloud)
    # The axes carry no meaning for a word cloud, so hide both tick sets.
    for clear_ticks in (plt.xticks, plt.yticks):
        clear_ticks([])
    plt.show()
def load_text():
    """Fetch the article and return it as space-separated jieba tokens.

    The text comes from ``load_url_text()``. It is stripped, any
    "digit-or-punctuation followed by newline" pair is removed, and the
    remainder is segmented with jieba in accurate mode (``cut_all=False``).

    Returns:
        str: the segmented tokens joined by single spaces.
    """
    # NOTE: to analyse a local file instead of a URL, read './article.txt'
    # (UTF-8) into ``article`` here.
    article = load_url_text().strip()

    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences such as '\，', which newer Python versions warn about.
    # The compiled pattern is unchanged.
    article = re.sub(r"[1-9\，\。\：\.\、\·\—\“\”]\n", "", article)

    return ' '.join(jieba.cut(article, cut_all=False))
def load_url_text(url=None, timeout=10):
    """Download a web page and reduce it to its non-ASCII text content.

    Args:
        url: Page to fetch. Defaults to the Toutiao article this script was
            written for.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``.

    Returns:
        str: the page body with HTML tags, newlines, ASCII letters, digits
        and a fixed set of punctuation characters removed.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx HTTP status.

    Side effects:
        Prints the cleaned text to stdout.
    """
    if url is None:
        url = "https://www.toutiao.com/article/7102612270106083881/?log_from=96bbf32ce5e7c_1653720623948"

    # NOTE(review): hard-coded session cookie captured from a browser; it
    # presumably expires and should not live in source control -- confirm
    # whether the request still works without it.
    headers = {
        "cookie":"s_v_web_id=verify_l3paknge_fo4iQ6FK_Sl6o_4gXi_Bq1M_QsNhnzjQHYHr; local_city_cache=%E5%8E%A6%E9%97%A8; csrftoken=50ebce478fdb72f8e18eff8fdeaf6727; _tea_utm_cache_24=undefined; tt_webid=7091827241689515551; MONITOR_WEB_ID=2897349e-681b-4319-9b25-f56e05444364; ttwid=1%7CkKdmscTcdpXdUa43CSJOJKkbwJHkrqqnvLSfqmnr92Y%7C1653720989%7C4a2c329e498e864641ddb482670e608110c9461cc101bcdb46a10a586db340be; tt_scid=-NPtai0Iso3HxyvBeDq1HsEGXtheapKHGt1n1xxQOs41rJHZ4Cba1FZTcbRZm8PN2fbb",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of running keyword extraction on an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'

    # Drop anything that looks like an HTML tag, then flatten to one line.
    result = re.sub(r'<[^<]+?>', '', response.text).replace('\n', '').strip()
    # Remove ASCII letters/digits and the listed punctuation. Raw string
    # avoids the invalid escape sequences of the previous literal; the
    # compiled pattern is unchanged.
    result = re.sub(r'[0-9A-Za-z\，\。\：\.\、\·\—\“\”\!\&\!\%\]\[\{\}\_]', "", result)

    print(result)
    return result
if __name__ == '__main__':
    # Pipeline: fetch + segment the article, score its terms with TF-IDF
    # (as a single-document corpus), then display the word cloud.
    segmented_text = load_text()
    term_weights = model([segmented_text])
    showImg(term_weights)

在这里插入图片描述

东哥爱编程

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
关于链接提取文章关键字的实现- python

今天刷新闻的时候，想到能不能有个软件可以快速帮我总结新闻的要点，我直接通过关键字就能大概浏览文章的内容。说干就干# -*- coding: utf-8 -*-import jiebafrom sklearn.feature_extraction.text import TfidfVectorizerfrom wordcloud import WordCloudimport matplotlib.pyplot as pltimport reimport requestsfrom bs4 imp
复制链接

扫一扫