淘宝爬取某一商品所有页面信息案例

why do not

已于 2022-04-04 12:37:48 修改

阅读量1.3k

点赞数 1

分类专栏：爬虫 文章标签：python

于 2020-10-12 16:49:32 首次发布

本文链接：https://blog.csdn.net/qq_42994177/article/details/108670649

版权

爬虫专栏收录该内容

9 篇文章 0 订阅

订阅专栏

功能：淘宝搜索一个关键字，查询商品信息（最多100页）

说明：下面代码中的cookie有删减，请使用自己的cookie

cookie获取方法可参考本文：Python+selenium使用cookie登录，如何获取cookie_why do not的博客-CSDN博客_python selenium获取cookies

#! /usr/bin/env python 
# -*- coding:utf-8 -*-
"""
selenium 可以模拟人去控制浏览器
功能：淘宝搜索一个关键字，查询商品信息（最多100页）
方式：100个页面通过url访问（不通过点击下一页或其他），模拟人去拉动下滑条，直接获取xpath路径数据
fake_useragent.json文件参考本人其他博客
"""
from selenium import webdriver
import time
import re,os
from lxml import etree
from fake_useragent import UserAgent


# The cookie entries below use lowercase JSON-style booleans; these aliases
# let the entries be pasted into Python unchanged.
true = True
false = False

# Login cookies for taobao.com as a list of dicts, one dict per cookie.
# The list is intentionally truncated here: replace it with the complete set
# exported from your own logged-in browser session.
cookies = [
{
    "domain": ".taobao.com",
    "expirationDate": 1631901735.114169,
    "hostOnly": false,
    "httpOnly": false,
    "name": "_cc_",
    "path": "/",
    "sameSite": "no_restriction",
    "secure": true,
    "session": false,
    "storeId": "0",
    "value": "UIHiLt3xSw%3D%3D",
    "id": 1
},
{
    "domain": ".taobao.com",
    "expirationDate": 1601004935.565767,
    "hostOnly": false,
    "httpOnly": false,
    "name": "_m_h5_tk",
    "path": "/",
    "sameSite": "no_restriction",
    "secure": true,
    "session": false,
    "storeId": "0",
    "value": "83e687d147cdaf7f8c2a68c7133af57f_1600410215568",
    "id": 2
},
# ... remaining cookies omitted; a bare "... ..." here would be a syntax error ...
{
    "domain": "s.taobao.com",
    "hostOnly": true,
    "httpOnly": true,
    "name": "JSESSIONID",
    "path": "/",
    "sameSite": "unspecified",
    "secure": false,
    "session": true,
    "storeId": "0",
    "value": "672120E2C0B4AB8048A221FFD276B810",
    "id": 25
},
{
    "domain": "s.taobao.com",
    "hostOnly": true,
    "httpOnly": false,
    "name": "lastalitrackid",
    "path": "/",
    "sameSite": "unspecified",
    "secure": false,
    "session": true,
    "storeId": "0",
    "value": "www.taobao.com",
    "id": 26
}
]


def driver_chrome():
    """Create and return a configured Chrome WebDriver.

    Reads the module-level ``driver_path`` for the chromedriver executable and
    loads user-agent data from ``fake_useragent.json`` in the current working
    directory.

    Returns:
        The ``webdriver.Chrome`` instance, with a 10-second implicit wait set.
    """
    chrome_options = webdriver.ChromeOptions()
    # Experimental options: exclude the "enable-automation" switch and turn
    # off the automation extension.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    # Uncomment to run without a visible browser window.
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")  # disable GPU acceleration
    # Load user agents from a local file; per the original author,
    # UserAgent(verify_ssl=False) may time out.
    ua = UserAgent(path=os.path.join(os.getcwd(), 'fake_useragent.json'))
    # Pass the generated user-agent value; the previous code sent the literal
    # text "ua.random" because the expression sat inside the string.
    chrome_options.add_argument('user-agent=' + ua.random)

    driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
    # The implicit wait (up to 10 seconds) applies to the driver as a whole,
    # so it only needs to be set once.
    driver.implicitly_wait(10)

    return driver


# 登录后，拉动下滑条，采集数据
def draw_down():
    """Scroll the current page down in five steps, pausing before each step.

    Uses the module-level ``driver``. The scroll position is set to 10%, 30%,
    50%, 70% and 90% of the document's scroll height in turn, with a
    half-second sleep before each move.
    """
    script_template = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f'
    for tenths in (1, 3, 5, 7, 9):
        time.sleep(0.5)
        # Fraction of the total scroll height to jump to on this step.
        driver.execute_script(script_template % (tenths / 10))


# 元素定位，得到页数
def serch_product():
    """Search for the module-level ``keyword`` and return the page count.

    Types the keyword into the search box on the page currently loaded in the
    module-level ``driver``, clicks the search button, and reads the first
    number from the pager text (e.g. "共 100 页，").

    Returns:
        int: the first number found in the pager text.

    Raises:
        ValueError: if the pager text contains no digits.
    """
    # Locate the search input box and type the keyword.
    driver.find_element_by_xpath('//*[@id="q"]').send_keys(keyword)
    # NOTE: the search button differs between the first search and later ones.
    # Only one search is made here, so only the first-search button is used.
    # For later searches the button is at '//*[@id="J_SearchForm"]/button'.
    driver.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button').click()
    # Read the pager text that contains the total page count.
    pager_text = driver.find_element_by_xpath('//*[@id="mainsrp-pager"]/div/div/div/div[1]').text
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    match = re.search(r'(\d+)', pager_text)
    if match is None:
        raise ValueError('no page count found in pager text: %r' % pager_text)
    return int(match.group(1))


def get_product():
    """Print one dict per product card found on the current result page.

    Uses the module-level ``driver``. For each matching card element the
    title, price, deal count, image URL and shop name are read and printed as
    a dict; nothing is returned.
    """
    # '//' matches at any depth; a leading '.' makes a path relative to the card.
    card_xpath = '//div[@class="items"]/div[@class="item J_MouserOnverReq  "]'
    for card in driver.find_elements_by_xpath(card_xpath):
        title = card.find_element_by_xpath('.//div[@class="row row-2 title"]').text
        price_link = card.find_element_by_xpath('.//a[@class="J_ClickStat"]')
        price_text = price_link.get_attribute('trace-price') + '元'
        deal_count = card.find_element_by_xpath('.//div[@class="deal-cnt"]').text
        picture = card.find_element_by_xpath('.//div[@class="pic"]/a/img')
        picture_src = picture.get_attribute('src')
        shop_name = card.find_element_by_xpath('.//div[@class="shop"]/a/span[2]').text

        print({
            '标题': title,
            '价格': price_text,
            '订单量': deal_count,
            '图片': picture_src,
            '名字': shop_name,
        })


def next_page():
    """Scrape every result page for the module-level ``keyword``.

    Runs the search, scrolls and scrapes the first page, then loads each
    further page directly by URL (not by clicking "next"), scrolling and
    scraping each one via ``draw_down`` and ``get_product``.
    """
    from urllib.parse import quote  # local import keeps this block self-contained

    pages = serch_product()
    draw_down()
    get_product()
    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot corrupt the query string.
    encoded_keyword = quote(keyword, safe='')
    # A bounded range terminates even when ``pages`` is less than 1, where the
    # former ``while num != pages`` loop would never end.
    for page_index in range(1, pages):
        # The ``s`` parameter advances by 44 per page (presumably the number
        # of items per result page -- confirm against the site).
        driver.get('https://s.taobao.com/search?q={}&s={}'.format(encoded_keyword, 44 * page_index))
        draw_down()
        get_product()


# 方式2：获得页面源代码并解析数据
def get_html_data():
    """Approach 2: parse the first result page from its HTML source.

    Runs the search through ``serch_product``, takes the page source from the
    module-level ``driver``, parses it with lxml and extracts one dict per
    product element. Each dict is printed as it is built.

    Returns:
        list[dict]: one dict per product. '标题' and '图片' are single
        strings; '价格', '销量' and '地址' are the raw lists returned by
        ``xpath``.

    Raises:
        IndexError: if a product element has no image ``src`` or fewer than
            two title text nodes.
    """
    serch_product()  # perform the initial search
    html = driver.page_source  # HTML source of the current page
    tree = etree.HTML(html)  # parse the HTML into an element tree
    divs = tree.xpath('//div[@class="grid g-clearfix"]/div[@class="items"]/div')
    shop_list = []
    for div in divs:
        image = div.xpath('.//a/img/@src')[0]
        title = div.xpath('.//div[@class="row row-2 title"]/a/text()')[1]
        price = div.xpath('.//div[@class="price g_price g_price-highlight"]/span/strong/text()')
        deal = div.xpath('.//div[@class="deal-cnt"]/text()')
        location = div.xpath('.//div[@class="location"]/text()')

        # Named ``item`` rather than ``dict`` to avoid shadowing the builtin.
        item = {'标题':title, '图片':image, '价格':price, '销量':deal, '地址':location}
        shop_list.append(item)
        print(item)

    # Return the results so callers can use them; the list was previously
    # built and then discarded.
    return shop_list





if __name__ == '__main__':
    # Raw string so the backslashes in the Windows path are never read as
    # escape sequences.
    driver_path = r'D:\install\chromedriver.exe'
    url = "https://www.taobao.com/"
    keyword = "手机"

    driver = driver_chrome()
    try:
        driver.get(url)  # open the site before adding its cookies
        # Install the login cookies.
        for item in cookies:
            # Drop 'sameSite' before adding (presumably because the exported
            # values are not accepted by add_cookie -- confirm).
            item.pop('sameSite', None)
            driver.add_cookie(item)
        # Approach 1: scrape every page through the live DOM.
        # next_page()
        # Approach 2: parse the page source. Bind the result to a name;
        # ``shop_list`` was otherwise undefined at module scope and the
        # print raised NameError.
        shop_list = get_html_data()
        print(shop_list)
    finally:
        # Always shut down the browser and chromedriver process.
        driver.quit()

why do not

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
淘宝爬取某一商品所有页面信息案例

#! /usr/bin/env python # -*- coding:utf-8 -*-"""selenim 可以模拟人去控制浏览器功能：淘宝搜索一个关键字，查询商品信息（最多100页）方式：100个页面通过url访问（不通过点击下一页或其他），模拟人去拉动下滑条，直接获取xpath路径数据fake_useragent.json文件参考本人其他博客"""from selenium import webdriverimport timeimport re,osfrom lxml im.
复制链接

扫一扫