爬取百度热榜新闻概括

最新推荐文章于 2024-07-01 11:04:11 发布

Z0o1010

最新推荐文章于 2024-07-01 11:04:11 发布

阅读量149

点赞数

分类专栏： Spider_Practise

本文链接：https://blog.csdn.net/weixin_48732879/article/details/111308928

版权

Spider_Practise 专栏收录该内容

8 篇文章 3 订阅

订阅专栏

import requests
from bs4 import BeautifulSoup


def get_html(url, headers, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP request headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled server
            would hang the whole script.

    Returns:
        The response body decoded with the encoding detected from its content.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Decode with the encoding guessed from the body rather than the one
    # declared in the HTTP headers, which may be absent or wrong.
    r.encoding = r.apparent_encoding
    return r.text


def _cell_text(cell):
    """Return the text of a table cell with all spaces and newlines removed."""
    return cell.get_text().replace(' ', '').replace('\n', '')


def get_pages(html):
    """Extract the ranking rows from *html*, append them to a file and to ``s``.

    Every ``<tr>`` after the first one is inspected. Rows that contain all
    three of ``td.first`` (rank), ``td.keyword`` (title) and ``td.last``
    (search index) are formatted into one line each and appended to
    ``百度热榜.txt`` (UTF-8). Rows missing any of those cells are skipped.

    Side effects:
        Appends to the file ``百度热榜.txt`` in the working directory (the
        file is only touched when at least one row matched) and appends each
        title, with the substring ``search`` removed, plus a newline to the
        module-level string ``s``, which must already exist.

    Args:
        html: Markup of the ranking page as a string.
    """
    global s
    soup = BeautifulSoup(html, 'html.parser')
    # {3} is the fill character: U+3000 (ideographic space) keeps CJK titles
    # visually aligned when centred in a 15-character column.
    tplt = "排名：{0:^4} 标题：{1:{3}^15} 热度：{2:^8}"
    lines = []
    names = []
    # The first <tr> is skipped (presumably the table's header row).
    for each_topic in soup.find_all('tr')[1:]:
        rank_cell = each_topic.find('td', class_='first')  # rank
        name_cell = each_topic.find('td', class_='keyword')  # title
        times_cell = each_topic.find('td', class_='last')  # search index
        if rank_cell is None or name_cell is None or times_cell is None:
            continue
        topic_rank = _cell_text(rank_cell)
        topic_name = _cell_text(name_cell)
        topic_times = _cell_text(times_cell)
        # Use the formatted text as-is. Converting a one-element list with
        # str() and stripping the brackets goes through repr(), which writes
        # the U+3000 padding as a literal "\u3000" escape and leaves stray
        # brackets whenever a title contains a quote character.
        lines.append(tplt.format(topic_rank, topic_name, topic_times, chr(12288)) + '\n')
        names.append(topic_name.replace('search', '') + '\n')
    if lines:
        # Open the output file once instead of once per row.
        with open('百度热榜.txt', 'a', encoding='utf-8') as f:
            f.writelines(lines)
    s = s + ''.join(names)


# Accumulator for the topic titles; get_pages() appends to this module-level
# string, so it has to exist before that call.
s = ''

# Request settings: a minimal browser-like User-Agent and the address of the
# Baidu ranking page to scrape.
headers = {'User-Agent': 'Mozilla/5.0'}
url = 'http://top.baidu.com/buzz?b=1&fr=20811'

# Download the page, then parse it and write out the ranking.
html = get_html(url, headers)
get_pages(html)

Z0o1010

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
爬取百度热榜新闻概括

import requests; from bs4 import BeautifulSoup; def get_html(url, headers): r = requests.get(url, headers=headers); r.encoding = r.apparent_encoding; return r.text; def get_pages(html): global s; soup = BeautifulSoup(html, 'html.parser') …
复制链接

扫一扫

专栏目录