功能:淘宝搜索一个关键字,查询商品信息(最多100页)
说明:下面代码中的cookie有删减,请使用自己的cookie
cookie获取方法可参考这篇CSDN博客:《Python+selenium使用cookie登录,如何获取cookie》(作者:why do not)
#! /usr/bin/env python
# -*- coding:utf-8 -*-
"""
selenium 可以模拟人去控制浏览器
功能:淘宝搜索一个关键字,查询商品信息(最多100页)
方式:100个页面通过url访问(不通过点击下一页或其他),模拟人去拉动下滑条,直接获取xpath路径数据
fake_useragent.json文件参考本人其他博客
"""
from selenium import webdriver
import time
import re,os
from lxml import etree
from fake_useragent import UserAgent
# Aliases for JSON-style lowercase booleans, so the cookie entries below
# (which use `true` / `false`) evaluate as Python.
true = True
false = False
# Browser cookies for taobao.com, one dict per cookie. The __main__ section
# removes each entry's "sameSite" key and passes the dict to driver.add_cookie().
# The values here are truncated samples; substitute your own cookies.
cookies = [
{
"domain": ".taobao.com",
"expirationDate": 1631901735.114169,
"hostOnly": false,
"httpOnly": false,
"name": "_cc_",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "UIHiLt3xSw%3D%3D",
"id": 1
},
{
"domain": ".taobao.com",
"expirationDate": 1601004935.565767,
"hostOnly": false,
"httpOnly": false,
"name": "_m_h5_tk",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "83e687d147cdaf7f8c2a68c7133af57f_1600410215568",
"id": 2
},
# NOTE(review): the next line is a placeholder for cookie entries elided from
# the listing (ids jump from 2 to 25). It is not valid Python as written and
# must be replaced with real cookie dicts before the script can run.
... ...
{
"domain": "s.taobao.com",
"hostOnly": true,
"httpOnly": true,
"name": "JSESSIONID",
"path": "/",
"sameSite": "unspecified",
"secure": false,
"session": true,
"storeId": "0",
"value": "672120E2C0B4AB8048A221FFD276B810",
"id": 25
},
{
"domain": "s.taobao.com",
"hostOnly": true,
"httpOnly": false,
"name": "lastalitrackid",
"path": "/",
"sameSite": "unspecified",
"secure": false,
"session": true,
"storeId": "0",
"value": "www.taobao.com",
"id": 26
}
]
def driver_chrome():
    """Create and configure the Chrome WebDriver used by the scraper.

    Reads the module-level ``driver_path`` for the chromedriver executable
    and loads user-agent strings from ``fake_useragent.json`` in the current
    working directory.

    Returns:
        A ``webdriver.Chrome`` instance with a 10-second implicit wait set.
    """
    chrome_options = webdriver.ChromeOptions()
    # Experimental options: exclude the "enable-automation" switch and turn
    # off the automation extension (the original notes describe these as
    # "developer mode" and "remove the developer warning").
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    # Headless mode is intentionally left off; add "--headless" here to
    # run without a visible browser window.
    chrome_options.add_argument("--disable-gpu")  # disable GPU acceleration
    # Load user agents from the local JSON file. The original author notes
    # that UserAgent(verify_ssl=False), which fetches remotely, may time out.
    ua = UserAgent(path=os.path.join(os.getcwd(), 'fake_useragent.json'))
    # Interpolate the generated value: the previous code passed the literal
    # text "ua.random" as the browser's user agent.
    chrome_options.add_argument('user-agent={}'.format(ua.random))
    driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
    # The implicit wait (up to 10 seconds) applies to the driver as a whole,
    # so it only needs to be set once.
    driver.implicitly_wait(10)
    return driver
# 登录后,拉动下滑条,采集数据
def draw_down():
    """Scroll the current page down in five steps, pausing before each.

    Uses the module-level ``driver``. The scroll position is set to 10%,
    30%, 50%, 70% and 90% of the document height in turn, with a
    half-second pause before each move.
    """
    script_template = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f'
    step = 1
    while step < 11:
        # Pause first so the page has time to load content between scrolls.
        time.sleep(0.5)
        # Fraction of the full page height to scroll to.
        fraction = step / 10
        driver.execute_script(script_template % fraction)
        step += 2
# 元素定位,得到页数
def serch_product():
    """Search for the module-level ``keyword`` and return the page count.

    Types the keyword into the search box of the page currently loaded in
    the module-level ``driver``, clicks the search button, and reads the
    total number of result pages from the pager text.

    Returns:
        int: the first number found in the pager text (e.g. 100 for
        "共 100 页,").

    Raises:
        ValueError: if the pager text contains no number.
    """
    # Locate the search input box and type the keyword.
    driver.find_element_by_xpath('//*[@id="q"]').send_keys(keyword)
    # The search button differs between the first search and later ones
    # (later searches use '//*[@id="J_SearchForm"]/button'). Only one search
    # is performed here, so the first-search button is used unconditionally.
    driver.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button').click()
    pager_text = driver.find_element_by_xpath('//*[@id="mainsrp-pager"]/div/div/div/div[1]').text
    # Raw string avoids the invalid "\d" escape in a normal string literal.
    match = re.search(r'(\d+)', pager_text)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.
        raise ValueError('no page count found in pager text: {!r}'.format(pager_text))
    return int(match.group(1))
def get_product():
    """Print one dict per product found on the current results page.

    Reads live elements through the module-level ``driver``. For each
    product card it collects the title text, price (the ``trace-price``
    attribute plus the currency suffix), deal count text, image URL and
    shop name, then prints them as a dict.
    """
    # "//" matches anywhere in the document; a leading "." in the per-card
    # XPaths below makes them relative to the card element.
    card_xpath = '//div[@class="items"]/div[@class="item J_MouserOnverReq "]'
    for card in driver.find_elements_by_xpath(card_xpath):
        title = card.find_element_by_xpath('.//div[@class="row row-2 title"]').text
        price_link = card.find_element_by_xpath('.//a[@class="J_ClickStat"]')
        price = price_link.get_attribute('trace-price') + '元'
        deal_count = card.find_element_by_xpath('.//div[@class="deal-cnt"]').text
        picture = card.find_element_by_xpath('.//div[@class="pic"]/a/img')
        image_url = picture.get_attribute('src')
        shop_name = card.find_element_by_xpath('.//div[@class="shop"]/a/span[2]').text
        print({
            '标题': title,
            '价格': price,
            '订单量': deal_count,
            '图片': image_url,
            '名字': shop_name,
        })
def next_page():
    """Scrape every result page for the module-level ``keyword``.

    Runs the search once to learn the page count and scrapes the first
    page, then loads each remaining page directly by URL (rather than
    clicking "next"), scrolling and scraping each one.
    """
    from urllib.parse import quote

    pages = serch_product()
    draw_down()
    get_product()
    # Percent-encode the keyword so characters such as "&", "#" or spaces
    # cannot corrupt the query string.
    encoded_keyword = quote(keyword, safe='')
    # range() visits page offsets 1 .. pages-1 and simply does nothing when
    # pages < 2; the previous `while num != pages` loop never terminated if
    # pages was below 1.
    for num in range(1, pages):
        # The `s` parameter advances by 44 per page (presumably the number
        # of items per results page -- confirm against the site).
        driver.get('https://s.taobao.com/search?q={}&s={}'.format(encoded_keyword, 44 * num))
        draw_down()
        get_product()
# 方式2:获得页面源代码并解析数据
def get_html_data():
    """Search once and parse the first results page from its HTML source.

    Runs the search through ``serch_product()``, parses the module-level
    ``driver``'s page source with lxml, and extracts one dict per item.
    Each dict is printed as it is built.

    Returns:
        list[dict]: one dict per item with the keys '标题' (title), '图片'
        (image URL), '价格' (price), '销量' (deal count) and '地址'
        (location). Title and image are single strings, or ``None`` when
        the item lacks them; the other three are the lists of text nodes
        returned by XPath.
    """
    serch_product()  # performs the search; the page count is not needed here
    tree = etree.HTML(driver.page_source)  # parse the HTML into an element tree
    divs = tree.xpath('//div[@class="grid g-clearfix"]/div[@class="items"]/div')
    shop_list = []
    for div in divs:
        images = div.xpath('.//a/img/@src')
        title_parts = div.xpath('.//div[@class="row row-2 title"]/a/text()')
        # Guard the indexing so one item without an image or a second title
        # text node does not abort the whole page with IndexError.
        image = images[0] if images else None
        title = title_parts[1] if len(title_parts) > 1 else None
        price = div.xpath('.//div[@class="price g_price g_price-highlight"]/span/strong/text()')
        deal = div.xpath('.//div[@class="deal-cnt"]/text()')
        location = div.xpath('.//div[@class="location"]/text()')
        # Named `item` rather than `dict` to avoid shadowing the builtin.
        item = {'标题': title, '图片': image, '价格': price, '销量': deal, '地址': location}
        shop_list.append(item)
        print(item)
    # Return the collected items; previously the list was built and dropped.
    return shop_list
if __name__ == '__main__':
    # Script entry point: open Taobao, install the saved cookies, then scrape.
    # Raw string so the backslashes in the Windows path are taken literally
    # ("\i" and "\c" are invalid escape sequences in a normal string).
    driver_path = r'D:\install\chromedriver.exe'
    url = "https://www.taobao.com/"
    keyword = "手机"
    driver = driver_chrome()
    try:
        driver.get(url)  # open the page before adding cookies
        # Install the saved cookies.
        for item in cookies:
            # Drop the "sameSite" key before handing the cookie to Selenium
            # (presumably because the exported values are not accepted by
            # add_cookie -- confirm).
            item.pop('sameSite', None)
            driver.add_cookie(item)
        # Method 1 (alternative): call next_page() to walk every result page
        # and read the data from live elements.
        # Method 2: parse the page source of the first results page.
        shop_list = get_html_data()
        # `shop_list` was previously never defined at module level, so this
        # print raised NameError.
        print(shop_list)
    finally:
        # Always shut the browser and chromedriver process down.
        driver.quit()