淘宝品牌数据爬虫

说明:下面代码中的 cookies 列表有删减(中间部分已省略),运行前请替换为自己账号的完整 cookie。

cookie 的获取方法可参考 CSDN 博客文章《Python+selenium使用cookie登录,如何获取cookie》(作者:why do not)。

# Login cookies for taobao.com in the format exported by the browser.
# The list is abridged: only the first and last entries of the original
# export are shown. Replace the whole list with the cookies of your own
# logged-in session before running the crawler.
cookies = [
    {
        "domain": ".taobao.com",
        "expirationDate": 1631774914.309654,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_cc_",
        "path": "/",
        "sameSite": "unspecified",
        "secure": False,
        "session": False,
        "storeId": "0",
        "value": "Vq8l%2BKCLiw%3D%3D",
        "id": 1
    },
    # ... remaining cookies omitted here; paste your own entries ...
    {
        "domain": ".taobao.com",
        "expirationDate": 1600324479,
        "hostOnly": False,
        "httpOnly": False,
        "name": "xlly_s",
        "path": "/",
        "sameSite": "no_restriction",
        "secure": True,
        "session": False,
        "storeId": "0",
        "value": "1",
        "id": 34
    }
]

import json
import time
from lxml import etree
from selenium import webdriver
from redis import Redis
from pymongo import MongoClient


class Taobao:
    """Selenium crawler that records the brand list Taobao shows per keyword.

    Keywords are read from the ``tabao_pinpai`` collection of the local
    ``taobao`` MongoDB database (field ``five`` of each document). For each
    keyword a search is run in a Chrome browser, the brand names found on the
    result page are extracted, and the enriched document is upserted into the
    ``taobao_pinpai_info`` collection.
    """

    # Cookie fields that WebDriver's add_cookie accepts unchanged.
    _WEBDRIVER_COOKIE_KEYS = ("name", "value", "path", "domain", "secure", "httpOnly")

    # Browser-export spellings of sameSite -> the values WebDriver accepts.
    # Anything else (e.g. "unspecified") is dropped from the cookie.
    _SAME_SITE_MAP = {
        "strict": "Strict",
        "lax": "Lax",
        "none": "None",
        "no_restriction": "None",
    }

    def __init__(self):
        """Open the MongoDB/Redis connections and set the crawler defaults."""
        self.mdb = MongoClient("127.0.0.1", 27017)["taobao"]
        self.conn = Redis(host='127.0.0.1', port=6379)
        self.url = "https://www.taobao.com/"
        self.headers = {
            "Content-Type": "application/json",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }
        # Local path of the chromedriver executable.
        self.driver_path = r"E:\chromedriver.exe"

    @staticmethod
    def _to_webdriver_cookie(cookie):
        """Return a copy of *cookie* reduced to the fields WebDriver accepts.

        Browser-exported cookies carry extra keys (``hostOnly``, ``session``,
        ``storeId``, ``id``), name the expiry ``expirationDate`` and spell
        ``sameSite`` differently from WebDriver. The input dict is not
        modified.

        Args:
            cookie: A cookie dict, either browser-exported or already in
                WebDriver form.

        Returns:
            A new dict containing only WebDriver cookie fields.
        """
        cleaned = {
            key: cookie[key]
            for key in Taobao._WEBDRIVER_COOKIE_KEYS
            if key in cookie
        }

        expires = cookie.get("expiry", cookie.get("expirationDate"))
        if expires is not None:
            # WebDriver expects whole seconds since the epoch.
            cleaned["expiry"] = int(expires)

        same_site = Taobao._SAME_SITE_MAP.get(str(cookie.get("sameSite", "")).lower())
        if same_site is not None:
            cleaned["sameSite"] = same_site
        return cleaned

    @staticmethod
    def _extract_brands(html):
        """Return the brand names found in a search-result page.

        The brand block is located by its CSS classes; anchors without any
        span text are skipped instead of raising ``IndexError``.
        """
        tree = etree.HTML(html)
        anchors = tree.xpath('//div[@class="items items-show2line J_Items"]/div[@class="items-inner g-clearfix"]/a')
        brands = []
        for anchor in anchors:
            texts = anchor.xpath('./span/text()')
            if texts:
                brands.append(texts[0])
        return brands

    @staticmethod
    def _search(driver, keyword, first_search):
        """Type *keyword* into the search box, submit it, return the page HTML.

        The home page and the result page use different search buttons, so
        *first_search* selects which one to click.
        """
        # Local import: the generic find_element(By, value) form works on
        # both Selenium 3 and 4, unlike the find_element_by_* shortcuts.
        from selenium.webdriver.common.by import By

        search_box = driver.find_element(By.ID, "q")
        search_box.send_keys(keyword)

        if first_search:
            button_xpath = '//*[@id="J_TSearchForm"]/div[1]/button'
        else:
            button_xpath = '//*[@id="J_SearchForm"]/button'
        driver.find_element(By.XPATH, button_xpath).click()

        return driver.page_source

    def chrome_driver(self):
        """Run the whole crawl in a Chrome browser.

        Opens ``self.url``, installs the module-level ``cookies``, then
        searches every keyword stored in MongoDB and upserts the extracted
        brand list. The browser is closed when the crawl ends or fails.
        """
        chrome_options = webdriver.ChromeOptions()
        # Experimental switches; presumably meant to make the browser look
        # less like an automated session.
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        # Headless mode is left off; add the "--headless" argument to enable it.
        chrome_options.add_argument("--disable-gpu")   # disable GPU acceleration

        # NOTE(review): executable_path is the Selenium 3 way of passing the
        # driver location; newer Selenium 4 releases expect a Service object.
        driver = webdriver.Chrome(executable_path=self.driver_path, options=chrome_options)
        try:
            # Cookies can only be set for the domain of the current page,
            # so the site is opened first.
            driver.get(self.url)
            for item in cookies:
                driver.add_cookie(self._to_webdriver_cookie(item))
            time.sleep(1)

            # NOTE(review): "tabao_pinpai" (sic) is kept as-is; confirm the
            # collection name is not a typo for "taobao_pinpai".
            records = list(self.mdb.tabao_pinpai.find({}).sort("_id", -1))  # newest first

            for position, record in enumerate(records):
                key = record["five"]
                html = self._search(driver, key, first_search=(position == 0))

                # Build the document to store. The page source is already a
                # string; json.dumps stores it as a quoted/escaped JSON string.
                record["html"] = json.dumps(html)
                record["keyword"] = key
                record["pinpai_list"] = self._extract_brands(html)
                # Drop the source document's _id so it is not written via $set.
                record.pop("_id", None)
                self.mdb["taobao_pinpai_info"].update_one(
                    {"html": record["html"]}, {"$set": record}, upsert=True
                )
                print("is download--->>>:", record["keyword"], record["pinpai_list"])

                # Empty the search box so the next keyword starts clean.
                driver.find_element("id", "q").clear()
            print("download done!!!")
        finally:
            # Always release the browser and the chromedriver process.
            driver.quit()


if __name__ == "__main__":
    # Start the crawl only when the file is executed as a script, so that
    # importing the module does not launch a browser.
    t = Taobao()
    t.chrome_driver()
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值