京东商品爬取
1.网址: https://www.jd.com/
2.目标: 输入搜索的商品,爬取内容如下:
1.商品的名称
2.商品的价格
3.评论数量
4.商家名称
3.准备工作
1.搜索框属性值: class="text"
2.搜索按钮属性值: class="button"
3.下一页属性值:
能点: class="pn-next"
不能点: class="pn-next disabled"
4.页面商品节点对象列表xpath表达式
//div[@id="J_goodsList"]/ul/li
4.保存数据到csv文件
代码
import csv
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
# Upper bound on the number of result pages crawled in one run.
MAX_PAGES = 3

# Badge text that JD inserts between the price and the title for
# second-hand ("Paipai") listings; it shifts the remaining fields by one.
PAIPAI_BADGE = '拍拍'


def parse_item(text):
    """Split the visible text of one product card into a CSV row.

    The card text is expected to contain, one per line: price, (optional
    Paipai badge), product name, comment count, shop name.

    Args:
        text: The ``.text`` of a product ``<li>`` element.

    Returns:
        ``[price, comments, shop, name]``, or ``None`` when the card has too
        few lines to contain all four fields (e.g. an ad or a placeholder).
    """
    fields = text.split('\n')
    # Name/comments/shop start at index 1, or 2 when the badge is present.
    start = 2 if len(fields) > 1 and fields[1] == PAIPAI_BADGE else 1
    if len(fields) < start + 3:
        return None
    price = fields[0]
    name, comments, shop = fields[start:start + 3]
    # Column order is kept as-is so rows stay compatible with existing files.
    return [price, comments, shop, name]


def main():
    """Search JD for a user-supplied keyword and append results to a CSV."""
    # A ChromeOptions object can be passed via ``options=`` to customise the
    # browser (for example to run it headless).
    driver = webdriver.Chrome()
    try:
        # Open the JD home page, then type the keyword and submit the search.
        driver.get('https://www.jd.com/')
        key = input('请输入要爬取的内容:')
        driver.find_element(By.CLASS_NAME, 'text').send_keys(key)
        driver.find_element(By.CLASS_NAME, 'button').click()
        time.sleep(2)

        # Open the output file once for the whole crawl instead of per row.
        with open('京东.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for page in range(1, MAX_PAGES + 1):
                # Scroll to the bottom so lazily loaded products are rendered.
                driver.execute_script(
                    'window.scrollTo(0,document.body.scrollHeight)'
                )
                time.sleep(2)

                # One <li> per product in the result list.
                items = driver.find_elements(
                    By.XPATH, '//div[@id="J_goodsList"]//ul/li'
                )
                for item in items:
                    row = parse_item(item.text)
                    if row is not None:
                        writer.writerow(row)
                print('第%d页爬取成功' % page)

                # A disabled "next" button marks the last result page.
                if 'pn-next disabled' in driver.page_source:
                    break
                driver.find_element(By.CLASS_NAME, 'pn-next').click()
                time.sleep(1)
    finally:
        # Always shut the browser down, even if the crawl fails midway.
        driver.quit()


if __name__ == '__main__':
    main()