from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from lxml import etree
import pandas as pd
import time
# Scrape product id, title, price and comment count from the first
# ``total_page`` result pages of a JD.com shop listing, then save the
# collected rows to an Excel workbook.


def _first_match(node, path, default=None):
    """Return the first result of XPath *path* evaluated on *node*.

    Falls back to *default* when the query matches nothing, so a product
    card with a missing field does not abort the whole crawl.
    """
    matches = node.xpath(path)
    return matches[0] if matches else default


# Absolute XPath of the <li> entries of the product list. It was derived
# from the full paths of individual products, e.g.
#   1st product: /html/body/div[3]/div/div[4]/div/div/div[2]/div/div/div/div[2]/ul/li[4]/div
#   2nd product: /html/body/div[3]/div/div[4]/div/div/div[2]/div/div/div/div[2]/ul/li[5]
#   3rd product: /html/body/div[3]/div/div[4]/div/div/div[2]/div/div/div/div[2]/ul/li[6]
# Dropping the positional predicate (li[4] is an XPath position, not a
# Python list index) selects every <li> of the list at once.
PRODUCT_ITEMS_XPATH = '/html/body/div[3]/div/div[4]/div/div/div[2]/div/div/div/div[2]/ul/li'
# "Next page" link; its full path on the page is
# /html/body/div[3]/div/div[4]/div/div/div[2]/div/div/div/div[3]/a[5]
NEXT_PAGE_XPATH = "//div/a[text()='下一页']"
# Fixed pause (seconds) used both after navigation and after scrolling.
PAGE_WAIT_SECONDS = 10

total_page = 3
k = 1
sku_list = []
price_list = []
title_list = []
comments_list = []

wd = webdriver.Edge()
try:
    wd.get('https://mall.jd.com/view_search-394872-0-4-1-24-1.html')
    while k <= total_page:
        # Sleep so the page has time to render.
        time.sleep(PAGE_WAIT_SECONDS)
        # Run JavaScript to scroll to the bottom of the page.
        wd.execute_script('window.scrollTo(0,document.body.scrollHeight);')
        # Wait for the data loaded by the scroll.
        time.sleep(PAGE_WAIT_SECONDS)

        # Convert the rendered page source into a tree that supports XPath.
        page_html = etree.HTML(wd.page_source)
        li_list = page_html.xpath(PRODUCT_ITEMS_XPATH)

        # All field paths below are relative to one <li> product entry.
        for i in li_list:
            # Product id, stored in the data-id attribute of .../li[n]/span.
            sku = _first_match(i, './span/@data-id')
            if sku is None:
                # No product id on this <li>: skip the whole row so the four
                # result lists stay aligned.
                continue

            # Product title, the text of the link in .../li[n]/div/div[3]/div[1]/a, e.g.
            # <a href="//item.jd.com/100054264406.html" target="_blank">AppleMacBook Pro ...</a>
            title = _first_match(i, './div/div[3]/div[1]/a/text()')

            # Product price, in .../li[n]/div/div[3]/div[2]/div/span[2], e.g.
            # <span class="jdNum" jdprice="100054264374" preprice="45499.00">45499.00</span>
            price = _first_match(i, './div/div[3]/div[2]/div/span[2]/text()')

            # Comment count, in .../li[n]/div/div[3]/div[3]/a/em, e.g.
            # <em class="jCommentNum" jdcomment="100054264374">1万+</em>
            comments = _first_match(i, './div/div[3]/div[3]/a/em/text()')

            sku_list.append(sku)
            title_list.append(title)
            price_list.append(price)
            comments_list.append(comments)

        # Only navigate when another page is still wanted; clicking after the
        # last requested page is wasted work and fails when no link exists.
        if k < total_page:
            # find_elements returns an empty list instead of raising when the
            # link is absent (the shop has fewer pages than total_page).
            next_links = wd.find_elements(By.XPATH, NEXT_PAGE_XPATH)
            if not next_links:
                break
            next_links[0].click()
        k += 1
finally:
    # quit() closes every window and ends the driver session, even when the
    # crawl raised. The original bare `wd.close` (no call) did nothing.
    wd.quit()

df = {'商品编码': sku_list, '商品标题': title_list, '商品价格': price_list, '商品评论数': comments_list}
sku_df = pd.DataFrame(df)
sku_df.to_excel('JD_Apple.xlsx', index=True)
# Bare expression kept from the original; it presumably displays the frame
# when the script runs as a notebook cell.
sku_df
# 结果如下: