# 说明:下面代码中的 cookie 内容有删减,运行前请替换为你自己登录后导出的 cookie。
# cookie 的获取方法可参考 CSDN 博客文章《Python+selenium使用cookie登录,如何获取cookie》(作者:why do not)。
# Login cookies that Taobao.chrome_driver() injects into the browser session.
# The entries below are truncated samples: replace the whole list with the
# cookies exported from your own logged-in session.
# NOTE(review): the key set (hostOnly, storeId, expirationDate, ...) looks like
# a browser-extension export rather than Selenium's own cookie format - confirm.
cookies = [
    {
        "domain": ".taobao.com",
        "expirationDate": 1631774914.309654,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_cc_",
        "path": "/",
        "sameSite": "unspecified",
        "secure": False,
        "session": False,
        "storeId": "0",
        "value": "Vq8l%2BKCLiw%3D%3D",
        "id": 1,
    },
    # ... remaining cookies omitted here; paste the rest of your own export ...
    {
        "domain": ".taobao.com",
        "expirationDate": 1600324479,
        "hostOnly": False,
        "httpOnly": False,
        "name": "xlly_s",
        "path": "/",
        "sameSite": "no_restriction",
        "secure": True,
        "session": False,
        "storeId": "0",
        "value": "1",
        "id": 34,
    },
]
import json
import time
from lxml import etree
from selenium import webdriver
from redis import Redis
from pymongo import MongoClient
class Taobao:
    """Crawl Taobao search result pages with Selenium and store their brand lists.

    For every document in the ``tabao_pinpai`` MongoDB collection, the value of
    its ``five`` field is typed into the Taobao search box, the resulting page
    is parsed for brand names, and the enriched document is upserted into the
    ``taobao_pinpai_info`` collection.
    """

    # Lower-cased sameSite spellings mapped to the three values WebDriver
    # accepts. Anything not listed here (e.g. "unspecified") is dropped.
    _SAME_SITE_VALUES = {
        "no_restriction": "None",
        "none": "None",
        "lax": "Lax",
        "strict": "Strict",
    }

    def __init__(self):
        """Open the local MongoDB/Redis connections and set crawl settings."""
        # Handle on the "taobao" database of the local MongoDB server.
        self.mdb = MongoClient("127.0.0.1", 27017)["taobao"]
        # Local Redis connection; chrome_driver() does not use it.
        self.conn = Redis(host='127.0.0.1', port=6379)
        # Page opened first; cookies are added while on this domain.
        self.url = "https://www.taobao.com/"
        # Request headers; chrome_driver() does not use them.
        self.headers = {
            "Content-Type": "application/json",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }
        # Filesystem path of the chromedriver executable.
        self.driver_path = r"E:\chromedriver.exe"

    @staticmethod
    def _normalize_cookie(cookie):
        """Return a copy of *cookie* whose ``sameSite`` value WebDriver accepts.

        Recognised spellings are mapped to "Strict", "Lax" or "None"; any other
        value is removed from the copy. All other keys are passed through
        untouched and the input dict is never modified.
        """
        normalized = dict(cookie)
        same_site = normalized.pop("sameSite", None)
        if isinstance(same_site, str):
            mapped = Taobao._SAME_SITE_VALUES.get(same_site.lower())
            if mapped is not None:
                normalized["sameSite"] = mapped
        return normalized

    @staticmethod
    def _extract_brands(html):
        """Return the brand names found in a search result page's HTML.

        The brand links are located by their exact class attributes; anchors
        without a ``<span>`` text node are skipped instead of raising.
        """
        tree = etree.HTML(html)
        if tree is None:
            return []
        # NOTE: the brand strip is not at a fixed position in the page, so it
        # is addressed by class names rather than by position.
        anchors = tree.xpath(
            '//div[@class="items items-show2line J_Items"]'
            '/div[@class="items-inner g-clearfix"]/a'
        )
        brands = []
        for anchor in anchors:
            texts = anchor.xpath('./span/text()')
            if texts:
                brands.append(texts[0])
        return brands

    def chrome_driver(self):
        """Run the whole crawl in a Chrome session and close it afterwards.

        Side effects: launches Chrome, reads every document of
        ``tabao_pinpai``, upserts one document per keyword into
        ``taobao_pinpai_info`` and prints progress to stdout.
        """
        chrome_options = webdriver.ChromeOptions()
        # Experimental switches intended to make the browser look less like an
        # automated session.
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        # Headless mode is left off; add the "--headless" argument to enable it.
        chrome_options.add_argument("--disable-gpu")  # disable GPU acceleration
        driver = webdriver.Chrome(executable_path=self.driver_path, chrome_options=chrome_options)
        try:
            # The page must be open before cookies for its domain can be set.
            driver.get(self.url)
            for item in cookies:
                driver.add_cookie(self._normalize_cookie(item))
            time.sleep(1)

            # Materialise the cursor up front (newest _id first) so it is not
            # held open during the slow browser work below.
            # NOTE(review): the collection is spelled "tabao_pinpai"; kept
            # as-is because it must match the existing collection - confirm.
            data_list = list(self.mdb.tabao_pinpai.find({}).sort("_id", -1))
            for position, record in enumerate(data_list):
                key = record["five"]
                search_box = driver.find_element_by_id("q")
                search_box.send_keys(key)
                # The first search starts from the home page, whose search
                # button differs from the one on the result pages.
                if position == 0:
                    driver.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button').click()
                else:
                    driver.find_element_by_xpath('//*[@id="J_SearchForm"]/button').click()
                html = driver.page_source

                # The page source is stored JSON-encoded (a quoted, escaped
                # string), and that same value is the upsert key below.
                record["html"] = json.dumps(html)
                record["keyword"] = key
                record["pinpai_list"] = self._extract_brands(html)
                # Drop the source collection's _id so $set does not try to
                # overwrite the target document's _id.
                record.pop("_id")
                self.mdb["taobao_pinpai_info"].update_one(
                    {"html": record["html"]}, {"$set": record}, upsert=True
                )
                print("is download--->>>:", record["keyword"], record["pinpai_list"])
                # Empty the search box so the next keyword starts clean.
                driver.find_element_by_id("q").clear()
            print("download done!!!")
        finally:
            # Always shut the browser and chromedriver down, even on errors.
            driver.quit()
# Script entry: build the crawler (this opens the MongoDB and Redis
# connections) and run the crawl immediately.
t = Taobao()
t.chrome_driver()