淘宝品牌数据爬虫

why do not

已于 2022-04-04 12:46:22 修改

阅读量1.2k

点赞数

分类专栏：爬虫　文章标签：python

于 2020-10-12 16:42:11 首次发布

本文链接：https://blog.csdn.net/qq_42994177/article/details/108670601

版权

爬虫专栏收录该内容

9 篇文章 0 订阅

订阅专栏

说明：下面代码中的cookie有删减，请使用自己的cookie

cookie获取方法可参考本文：Python+selenium使用cookie登录，如何获取cookie_why do not的博客-CSDN博客_python selenium获取cookies

# Cookies added to the browser session before searching.
# The list has been trimmed by the author (only the first and last entries
# are shown); replace it with the cookies from your own logged-in session.
cookies = [
    {
        "domain": ".taobao.com",
        "expirationDate": 1631774914.309654,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_cc_",
        "path": "/",
        "sameSite": "unspecified",
        "secure": False,
        "session": False,
        "storeId": "0",
        "value": "Vq8l%2BKCLiw%3D%3D",
        "id": 1
    },
    # ... intermediate cookie entries omitted; insert your own here ...
    {
        "domain": ".taobao.com",
        "expirationDate": 1600324479,
        "hostOnly": False,
        "httpOnly": False,
        "name": "xlly_s",
        "path": "/",
        "sameSite": "no_restriction",
        "secure": True,
        "session": False,
        "storeId": "0",
        "value": "1",
        "id": 34
    }
]

import json
import time
from lxml import etree
from selenium import webdriver
from redis import Redis
from pymongo import MongoClient


class Taobao:
    """Selenium-driven crawler that collects brand names from Taobao search pages.

    Search keywords are read from the ``tabao_pinpai`` MongoDB collection
    (field ``five``). For every keyword the search result page is loaded in
    Chrome, the brand list is parsed out of the page source and the enriched
    document is upserted into the ``taobao_pinpai_info`` collection.
    """

    def __init__(self):
        # Database that holds both the keyword source and the crawl results.
        self.mdb = MongoClient("127.0.0.1", 27017)["taobao"]
        # Redis connection; not used by the methods of this class.
        self.conn = Redis(host='127.0.0.1', port=6379)
        # Page opened first so that cookies can be attached to the domain.
        self.url = "https://www.taobao.com/"
        # HTTP headers; not used by the Selenium flow in this class.
        self.headers = {
            "Content-Type": "application/json",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }
        # Location of the chromedriver executable.
        self.driver_path = r"E:\chromedriver.exe"

    @staticmethod
    def _extract_brands(html):
        """Return the brand names found in the brand list of a result page.

        Anchors without any ``<span>`` text are skipped instead of raising
        ``IndexError``.
        """
        tree = etree.HTML(html)
        if tree is None:
            return []
        # NOTE: the brand list does not sit at a fixed position on the page,
        # so it is located through its exact class attributes.
        anchors = tree.xpath('//div[@class="items items-show2line J_Items"]/div[@class="items-inner g-clearfix"]/a')
        brands = []
        for anchor in anchors:
            texts = anchor.xpath('./span/text()')
            if texts:
                brands.append(texts[0])
        return brands

    def chrome_driver(self):
        """Crawl the brand list for every stored keyword and save the results.

        Opens Chrome, attaches the module-level ``cookies``, then for each
        document of ``tabao_pinpai`` (newest ``_id`` first) searches for its
        ``five`` value and upserts the page source, keyword and brand list
        into ``taobao_pinpai_info``. The browser is always closed on exit,
        including when an error interrupts the crawl.
        """
        chrome_options = webdriver.ChromeOptions()
        # Experimental options: drop the automation switch/extension so the
        # page is less likely to detect WebDriver (intended to keep the
        # ``webdriver`` property at its normal value).
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        # Headless mode is intentionally left disabled.
        # chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")   # disable GPU acceleration

        # NOTE(review): ``executable_path``/``chrome_options`` and the
        # ``find_element_by_*`` helpers below are the Selenium 3 API; they
        # are no longer available in recent Selenium 4 releases.
        driver = webdriver.Chrome(executable_path=self.driver_path, chrome_options=chrome_options)

        try:
            # Cookies can only be added once a page of the domain is open.
            driver.get(self.url)
            for item in cookies:
                driver.add_cookie(item)
            time.sleep(1)

            # NOTE(review): collection name is spelled "tabao_pinpai" (not
            # "taobao") - confirm it matches the actual collection.
            data_list = list(self.mdb.tabao_pinpai.find({}).sort("_id", -1))  # descending

            for position, i in enumerate(data_list):
                key = i["five"]
                search_box = driver.find_element_by_id("q")  # search input box
                search_box.send_keys(key)

                # The home page and the result page use different search
                # buttons; only the very first search starts on the home page.
                if position == 0:
                    driver.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button').click()
                else:
                    driver.find_element_by_xpath('//*[@id="J_SearchForm"]/button').click()

                html = driver.page_source

                # Build the stored document. The page source is kept as a
                # JSON-encoded string and also serves as the upsert key.
                i["html"] = json.dumps(html)
                i["keyword"] = key
                i["pinpai_list"] = self._extract_brands(html)
                # Drop the source document's _id so it is not written into
                # the target collection via $set.
                i.pop("_id")
                # update_one(..., upsert=True) is the supported equivalent of
                # the legacy Collection.update(filter, doc, True) call.
                self.mdb["taobao_pinpai_info"].update_one({"html": i["html"]}, {"$set": i}, upsert=True)
                print("is download--->>>：", i["keyword"], i["pinpai_list"])
                # Clear the search box so the next keyword starts from empty.
                driver.find_element_by_id("q").clear()
            print("download done！！！")
        finally:
            # Always release the browser and the chromedriver process.
            driver.quit()


if __name__ == "__main__":
    # Start the crawl only when the file is executed as a script, so that
    # importing the module does not open a browser.
    t = Taobao()
    t.chrome_driver()

why do not

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
淘宝品牌数据爬虫

cookies = [ { "domain": ".taobao.com", "expirationDate": 1631774914.309654, "hostOnly": False, "httpOnly": False, "name": "_cc_", "path": "/", "sameSite": "unspecified", "secure": False,.
复制链接

扫一扫

专栏目录