记查找href的爬虫代码

最新推荐文章于 2024-07-22 15:15:29 发布

双层小牛堡

最新推荐文章于 2024-07-22 15:15:29 发布

阅读量94

点赞数

文章标签： python 开发语言

本文链接：https://blog.csdn.net/m0_64180167/article/details/131135438

版权

    import requests
    from bs4 import  BeautifulSoup
    import pandas as pd
    def get_news_info(url, headers, timeout=10):
        """Download a page and extract the entries of its hot-ranking list.

        The page is expected to contain a ``<div class="mod_hot_rank">`` whose
        ``<ul>`` holds one ``<li>`` per ranked news item (the script uses this
        for the NetEase News front page).

        Args:
            url: Address of the page to download.
            headers: HTTP request headers forwarded to ``requests.get``.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A list of strings, one per ranking entry. Each string is the plain
            concatenation of rank, title, follow-up count and link, with no
            separator inserted between them. The list is empty when the
            ranking container is not present in the page.

        Raises:
            requests.RequestException: On connection failure, timeout, or a
                4xx/5xx HTTP status.
        """
        # A timeout keeps the script from hanging forever on a stalled connection.
        response = requests.get(url=url, headers=headers, timeout=timeout)
        # Fail loudly on an error status instead of parsing an error page.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')

        # Walk down step by step so that a missing node yields an empty result
        # rather than an AttributeError on None.
        rank_box = soup.find('div', class_='mod_hot_rank')
        rank_items = rank_box.find('ul') if rank_box is not None else None
        if rank_items is None:
            return []

        news_list = []
        for news in rank_items.find_all('li'):
            rank_tag = news.find('em')      # ranking position
            link_tag = news.find('a')       # title text and link target
            posts_tag = news.find('span')   # number of follow-up posts
            if rank_tag is None or link_tag is None or posts_tag is None:
                # Skip malformed entries instead of aborting the whole scrape.
                continue
            # Read the href attribute; fall back to '' if the anchor has none.
            news_url = link_tag.get('href', '')
            news_list.append(
                rank_tag.get_text()
                + link_tag.get_text()
                + posts_tag.get_text()
                + news_url
            )
        return news_list

    def save_as_csv(new_list, path='/home/qingjiao/news.csv'):
        """Write the collected news entries to a CSV file, one entry per row.

        Args:
            new_list: Sequence of entries to store; it is handed to
                ``pandas.DataFrame`` unchanged.
            path: Destination file. Defaults to the location the script has
                always written to, so existing callers are unaffected.

        Raises:
            OSError: If the destination cannot be opened for writing (for
                example, when its parent directory does not exist).
        """
        df = pd.DataFrame(new_list)
        # No index column and no header row: the file contains only the data.
        # UTF-8 is spelled out because the entries contain Chinese text.
        df.to_csv(path, index=False, header=False, encoding='utf-8')
        print('写入完成')
    # Script entry point: scrape the ranking, show it, then persist it.
    if __name__ == '__main__':
        # Browser-like User-Agent sent with the request.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'}
        # NetEase News front page.
        url = 'https://news.163.com/'

        # Run the scraper to collect the hot-ranking entries.
        news_list = get_news_info(url, headers)
        # Echo the collected entries to the console.
        print(news_list)
        # Store the entries as a CSV file.
        save_as_csv(news_list)