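# Use Selenium to get past the page's JavaScript-driven loading by scrolling down automatically, then scrape Suning book listings (苏宁图书).
# Target page: 烹饪/美食【报价 品牌 口碑评价 测评 正品行货 限时低价 分期】 -苏宁易购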
from selenium import webdriver
# 延迟模块
import time
# 解析html
from lxml import etree
# 页面加载等等的异常处理
from selenium.common.exceptions import TimeoutException
import re
import json
# Category page to scrape (Suning "cooking / food" book listing).
START_URL = 'https://list.suning.com/0-502336-0.html?safp=d488778a.46602.advancedFilter.38&safc=cate.0.0&safpn=10006.502282#search-path'
# Destination file for the scraped title/price records.
OUTPUT_PATH = '苏宁图书01.json'


def extract_price(label):
    """Return the text that precedes the first '元' in *label*.

    Returns ``None`` when *label* contains no '元', so that a single
    unexpected aria-label does not abort the whole run with an IndexError.
    """
    match = re.search(r'(.*?)元', label)
    return match.group(1) if match else None


def scroll_page(driver, steps=12, step_px=1000, pause=2):
    """Scroll the document down in *steps* increments of *step_px* pixels.

    Sleeps *pause* seconds before every scroll; presumably this gives the
    page time to load more items before the next jump.
    """
    for step in range(1, steps + 1):
        time.sleep(pause)
        driver.execute_script(f'document.documentElement.scrollTop={step}*{step_px}')


def main():
    """Open the listing page, scroll through it, parse titles/prices and save them."""
    # Create the browser object.
    chrome_obj = webdriver.Chrome()
    try:
        # Cap the page-load wait; a timeout is tolerated and whatever has been
        # loaded by then is used.
        chrome_obj.set_page_load_timeout(5)
        try:
            chrome_obj.get(START_URL)
        except TimeoutException:
            print('超时了......')
        chrome_obj.maximize_window()
        time.sleep(2)
        # Drive the scroll bar downwards step by step.
        scroll_page(chrome_obj)
        # Original author's note: after these scrolls the page held 120 books.
        # Grab the HTML of the page as it currently stands.
        str_data = chrome_obj.page_source
    finally:
        # Always shut the browser down, even if a step above failed.
        chrome_obj.quit()

    # Parse the book labels.
    html_obj = etree.HTML(str_data)
    title_list = html_obj.xpath('//div[@class="res-info"]//a/@aria-label')
    print(title_list)
    print(len(title_list))

    # Parse the prices; entries without a price marker become None so the
    # two lists stay aligned.
    price_list = [extract_price(title) for title in title_list]
    print(price_list)
    print(len(price_list))

    # Save one {label: price} object per line, each followed by a comma.
    # NOTE(review): the resulting file is a sequence of comma-terminated JSON
    # objects, not a single valid JSON document; format kept as-is for
    # compatibility with whatever reads it.
    with open(OUTPUT_PATH, 'w', encoding="utf-8") as f:
        for title, price in zip(title_list, price_list):
            dict_ = {title: price}
            json_data = json.dumps(dict_, ensure_ascii=False) + ',\n'
            f.write(json_data)
            print(dict_)


if __name__ == '__main__':
    main()