爬取搜狐新闻

基于selenium实现

from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
import time
from my_package import my_excel


if __name__ == '__main__':
    # Scrape news cards from Sohu channel pages with Selenium (Edge) and hand
    # the collected rows to my_excel.write_to_excel_all, one call per topic.
    #
    # Local import: selenium is already a dependency of this script; the base
    # WebDriver exception is only needed by the per-card parsing loop below.
    from selenium.common.exceptions import WebDriverException

    # [topic label, channel URL] pairs.
    target = [
        [
            '健康',
            'https://www.sohu.com/xchannel/TURBd01EQXdOekUx?pcm=202.260_11_0.0.0&scm=1103.plate:260:0.0.2.0&spm=smpc.channel_218.block1_78_z043c4_1_nav.14.1714097794024eat2j8S_715'
        ],
        [
            '历史',
            'https://history.sohu.com/?pcm=202.260_12_0.0.0&scm=1103.plate:260:0.0.2.0&spm=smpc.channel_218.block1_78_z043c4_1_nav.15.1714099193723qo2yjrH_715'
        ],
        [
            '文化',
            'https://cul.sohu.com/?pcm=202.260_15_0.0.0&scm=thor.260_14-201000.0.10005.0&spm=smpc.channel_230.block1_78_su9k48_1_nav.18.17140993186239v9IKaw_417'
        ],
        [
            '时政',
            'https://www.sohu.com/xchannel/tag?key=%E6%96%B0%E9%97%BB-%E6%97%B6%E6%94%BF&scm=10001.457_14-201000.0.10005.0&spm=smpc.channel_258.block2_225_ZJhqAx_1_nav.2.1714108588199X6MERGz_1090'
        ],
        [
            '国际',
            'https://www.sohu.com/xchannel/tag?key=%E6%96%B0%E9%97%BB-%E5%9B%BD%E9%99%85&scm=10001.457_14-201000.0.10005.0&spm=smpc.channel_258.block2_225_ZJhqAx_1_nav.3.1714108588199X6MERGz_1090'
        ],
        [
            '教育',
            'https://learning.sohu.com/?pcm=202.260_5_0.0.0&scm=thor.260_14-201000.0.10005.0&spm=smpc.channel_230.block1_78_su9k48_1_nav.7.1714107008420d73fzur_269'
        ],
        [
            '科技',
            'https://it.sohu.com/?pcm=202.260_7_0.0.0&scm=thor.260_14-201000.0.10005.0&spm=smpc.channel_230.block1_78_su9k48_1_nav.9.1714108947776SPOe6j4_743'
        ],
        [
            '财经',
            'https://business.sohu.com/?pcm=202.260_8_0.0.0&scm=thor.260_14-201000.0.10005.0&spm=smpc.channel_218.block1_78_z043c4_1_nav.10.1714108973726jN9YaK7_499'
        ],
        [
            '母婴',
            'https://baobao.sohu.com/?pcm=202.260_10_0.0.0&scm=thor.260_14-201000.0.10005.0&spm=smpc.channel_218.block1_78_z043c4_1_nav.12.1714109000441vonk1wA_704'
        ]
    ]

    # Only the first `topic_limit` entries of `target` are visited (the
    # original hard-coded 2). Raise it, e.g. to len(target), to scrape more.
    topic_limit = 2
    # Number of scroll steps per page and the distance of each step; scrolling
    # is what makes the page load additional cards before they are collected.
    scroll_steps = 20
    scroll_step_px = 500
    # NOTE(review): every topic is written to this same file; whether
    # write_to_excel_all appends or overwrites is not visible here - confirm.
    output_file = '搜狐新闻1.xlsx'

    # Create the Edge WebDriver instance.
    edge_driver_path = r"D:\develop\software\tools\msedgedriver.exe"  # Fill in the path to your own msedgedriver
    s = Service(edge_driver_path)
    browser = webdriver.Edge(service=s)
    try:
        browser.maximize_window()
        for j, (topic, url) in enumerate(target[:topic_limit]):
            try:
                data = []
                browser.get(url)
                print("正在加载第{}个主题".format(j+1))
                time.sleep(0.5)
                for i in range(scroll_steps):
                    # Jump to an absolute offset: step i lands at i * scroll_step_px.
                    js = 'var q=document.documentElement.scrollTop={}'.format(i * scroll_step_px)
                    browser.execute_script(js)
                    time.sleep(0.3)
                    if i % 10 == 0:
                        print('第{}次下拉成功'.format(i+1))
                frame = browser.find_element(By.CSS_SELECTOR, 'div.recommend-content-wrap')
                divs = frame.find_elements(By.CSS_SELECTOR, "div[data-spm-type='resource']")
                skipped = 0
                # The position counts every card, including ones that get skipped.
                for position, div in enumerate(divs, start=1):
                    try:
                        div_content = div.find_element(By.CSS_SELECTOR, 'div.item-text-content')
                        title = div_content.find_element(By.CSS_SELECTOR, 'div.item-text-content-title').text
                        content = div_content.find_element(By.CSS_SELECTOR, 'div.item-text-content-description').text
                        div_list = div.find_element(By.CSS_SELECTOR, 'div.extra-info-list')
                        div_list_a = div_list.find_element(By.TAG_NAME, 'a')
                        author = div_list_a.find_element(By.TAG_NAME, 'span').text
                        the_time = div_list.find_element(By.CSS_SELECTOR, 'span.extra-info-item').text
                    except WebDriverException:
                        # Best effort: a card missing any expected element is
                        # skipped, but counted so the loss is visible.
                        skipped += 1
                        continue
                    data.append([topic, title, content, author, the_time])
                    print('第{}个新闻插入成功'.format(position))
                if skipped:
                    print('跳过{}个无法解析的条目'.format(skipped))
                my_excel.write_to_excel_all(data, output_file)
                print(topic + '全部写入成功')

            except Exception as exc:
                # Per-topic boundary: a failing topic must not stop the
                # remaining ones, but the real cause is reported.
                print(topic + '未找到')
                print('原因: {!r}'.format(exc))
    finally:
        # Always shut the browser and driver process down, even on Ctrl+C.
        browser.quit()

请务必使用与你的 Edge 浏览器版本相匹配的驱动(msedgedriver),否则浏览器将无法启动!
有需要可以加学长微信
EchoYouChu

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值