# Implemented with Selenium.
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service

from my_package import my_excel
if __name__ == '__main__':
    # Script entry point: open each configured Sohu channel in Edge, scroll
    # the page to trigger lazy loading, collect title / summary / author /
    # time from every recommended article card, and hand the rows to
    # my_excel.write_to_excel_all.

    # Each entry is [topic label, channel URL]. The label becomes the first
    # column of every row scraped from that channel.
    target = [
        [
            '健康',
            'https://www.sohu.com/xchannel/TURBd01EQXdOekUx?pcm=202.260_11_0.0.0&scm=1103.plate:260:0.0.2.0&spm=smpc.channel_218.block1_78_z043c4_1_nav.14.1714097794024eat2j8S_715'
        ],
        [
            '历史',
            'https://history.sohu.com/?pcm=202.260_12_0.0.0&scm=1103.plate:260:0.0.2.0&spm=smpc.channel_218.block1_78_z043c4_1_nav.15.1714099193723qo2yjrH_715'
        ],
        [
            '文化',
            'https://cul.sohu.com/?pcm=202.260_15_0.0.0&scm=thor.260_14-201000.0.10005.0&spm=smpc.channel_230.block1_78_su9k48_1_nav.18.17140993186239v9IKaw_417'
        ],
        [
            '时政',
            'https://www.sohu.com/xchannel/tag?key=%E6%96%B0%E9%97%BB-%E6%97%B6%E6%94%BF&scm=10001.457_14-201000.0.10005.0&spm=smpc.channel_258.block2_225_ZJhqAx_1_nav.2.1714108588199X6MERGz_1090'
        ],
        [
            '国际',
            'https://www.sohu.com/xchannel/tag?key=%E6%96%B0%E9%97%BB-%E5%9B%BD%E9%99%85&scm=10001.457_14-201000.0.10005.0&spm=smpc.channel_258.block2_225_ZJhqAx_1_nav.3.1714108588199X6MERGz_1090'
        ],
        [
            '教育',
            'https://learning.sohu.com/?pcm=202.260_5_0.0.0&scm=thor.260_14-201000.0.10005.0&spm=smpc.channel_230.block1_78_su9k48_1_nav.7.1714107008420d73fzur_269'
        ],
        [
            '科技',
            'https://it.sohu.com/?pcm=202.260_7_0.0.0&scm=thor.260_14-201000.0.10005.0&spm=smpc.channel_230.block1_78_su9k48_1_nav.9.1714108947776SPOe6j4_743'
        ],
        [
            '财经',
            'https://business.sohu.com/?pcm=202.260_8_0.0.0&scm=thor.260_14-201000.0.10005.0&spm=smpc.channel_218.block1_78_z043c4_1_nav.10.1714108973726jN9YaK7_499'
        ],
        [
            '母婴',
            'https://baobao.sohu.com/?pcm=202.260_10_0.0.0&scm=thor.260_14-201000.0.10005.0&spm=smpc.channel_218.block1_78_z043c4_1_nav.12.1714109000441vonk1wA_704'
        ]
    ]

    # Only the first `topic_limit` entries of `target` are crawled (the
    # original loop bound was a literal 2). Raise it to len(target) to crawl
    # every channel.
    topic_limit = 2
    # Number of scroll steps per page and the distance covered by each step.
    scroll_steps = 20
    scroll_step_px = 500

    # Create the Edge WebDriver instance.
    edge_driver_path = r"D:\develop\software\tools\msedgedriver.exe"  # set this to your own msedgedriver path
    s = Service(edge_driver_path)
    browser = webdriver.Edge(service=s)
    try:
        browser.maximize_window()
        for j, (topic, url) in enumerate(target[:topic_limit]):
            try:
                data = []
                browser.get(url)
                print("正在加载第{}个主题".format(j + 1))
                time.sleep(0.5)

                # Scroll down step by step so the page loads more cards.
                for i in range(scroll_steps):
                    js = 'var q=document.documentElement.scrollTop={}'.format(i * scroll_step_px)
                    browser.execute_script(js)
                    time.sleep(0.3)
                    if i % 10 == 0:
                        print('第{}次下拉成功'.format(i + 1))

                frame = browser.find_element(By.CSS_SELECTOR, 'div.recommend-content-wrap')
                divs = frame.find_elements(By.CSS_SELECTOR, "div[data-spm-type='resource']")
                for q, div in enumerate(divs, start=1):
                    try:
                        div_content = div.find_element(By.CSS_SELECTOR, 'div.item-text-content')
                        title = div_content.find_element(By.CSS_SELECTOR, 'div.item-text-content-title').text
                        content = div_content.find_element(By.CSS_SELECTOR, 'div.item-text-content-description').text
                        div_list = div.find_element(By.CSS_SELECTOR, 'div.extra-info-list')
                        div_list_a = div_list.find_element(By.TAG_NAME, 'a')
                        author = div_list_a.find_element(By.TAG_NAME, 'span').text
                        the_time = div_list.find_element(By.CSS_SELECTOR, 'span.extra-info-item').text
                    except WebDriverException:
                        # Best effort: a card missing any of the expected
                        # sub-elements is skipped rather than aborting the
                        # whole topic.
                        continue
                    data.append([topic, title, content, author, the_time])
                    print('第{}个新闻插入成功'.format(q))

                # NOTE(review): called once per topic with the same file name;
                # whether this appends or overwrites depends on my_excel — confirm.
                my_excel.write_to_excel_all(data, '搜狐新闻1.xlsx')
                print(topic + '全部写入成功')
            except Exception as exc:
                # Per-topic boundary: report the failure together with its
                # cause and move on to the next topic.
                print('{}未找到: {!r}'.format(topic, exc))
    finally:
        # Always shut the browser and driver process down, even on Ctrl+C or
        # an unexpected error.
        browser.quit()
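# Be sure to use the driver version that matches your browser; otherwise the browser window will not open!
# If you need help, you can add the author on WeChat:
# EchoYouChu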