Scraping Taobao data with python + selenium
I have looked at a lot of Taobao scrapers, and most of them only collect data from the search results page, so I wrote one that also scrapes the product detail pages.
It collects the title, shop, product variants, the image and price of each variant, and some of the product parameter information.
Finally, openpyxl is used to write the data into an Excel file (a short sketch of that step follows below).
The code only works for products whose variants are limited to "color" and "bundle/package type"; products with other variant dimensions are not supported.
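As a minimal sketch of that last step, the rows can be appended with openpyxl roughly like this. The file name, sheet name, header columns and example rows here are placeholders of my own, not the ones used later in the script:

import openpyxl

# made-up example rows in the order title / shop / variant / image url / price
rows = [
    ['扫地机器人A', '某旗舰店', '白色 标准版', 'https://example.com/a.jpg', '¥1999.00'],
    ['扫地机器人B', '某专卖店', '黑色 尊享版', 'https://example.com/b.jpg', '¥4800.80'],
]

wb = openpyxl.Workbook()
ws = wb.active
ws.title = 'taobao'                                  # assumed sheet name
ws.append(['标题', '店铺', '款式', '图片', '价格'])  # assumed header row
for row in rows:
    ws.append(row)                                   # one list per Excel row
wb.save('taobao.xlsx')                               # assumed output file name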
'''
Taobao "robot vacuum" (扫地机器人) scraper, source code
'''
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
# from selenium.webdriver.edge.service import Service
import json
import pandas as pd
import openpyxl
# Fetch the login cookies. Run this only once: it saves the cookie data locally so later scraping runs do not need a manual login.
def hq_cookie():
    weba = webdriver.Chrome()
    weba.maximize_window()
    weba.get('https://www.taobao.com/')
    weba.delete_all_cookies()
    time.sleep(50)  # log in manually within 50 seconds; adjust the delay to suit yourself
    dictcookies = weba.get_cookies()  # grab the cookies (a list of dicts)
    jsoncookies = json.dumps(dictcookies)  # serialize to JSON
    with open('../cookie/cookiesTaobao.txt', 'w') as f:
        f.write(jsoncookies)  # write the cookies to a file
    weba.quit()
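# Note: the call below is not part of the original snippet. Uncomment and run it once,
# logging in manually while the browser window is open, to create ../cookie/cookiesTaobao.txt
# before calling main().
# hq_cookie()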
'''
Scrape a single page of data
1. Log in with the saved cookies, then jump to the keyword search results page
2. Inspect the page structure with devtools, locate the elements and extract the data (product title, price, sales, place of origin, shop)
'''
def main(keyword):
    web = webdriver.Chrome()
    web.maximize_window()
    url = 'https://www.taobao.com/'
    web.get(url)
    with open('../cookie/cookiesTaobao.txt', 'r') as f:
        cookies = f.read()
    cookies = json.loads(cookies)
    # load the saved cookies into the browser session
    for cookie in cookies:
        # print(cookie)
        if 'expiry' in cookie:
            del cookie['expiry']  # Selenium can reject the 'expiry' field, so drop it
        web.add_cookie(cookie)  # add every cookie, with or without 'expiry'
    web.get(url)
    time.sleep(5)
    # keyword search: 扫地机器人 (robot vacuum)
    '''
    Could this be set up so the keyword is typed in and a click triggers the search?
    Wrap it up as a method.
    '''
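    # A possible answer to the question above (a sketch only, not used by this script):
    # the search step could be wrapped in a helper that reuses the same locators, e.g.
    # def search(driver, kw):
    #     box = driver.find_element(By.ID, 'q')
    #     box.clear()
    #     box.send_keys(kw)
    #     driver.find_element(By.XPATH, "//button[@class='btn-search tb-bg']").click()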
    input = web.find_element(By.ID, 'q')  # search box
    input.clear()
    input.send_keys(keyword)
    button = web.find_element(By.XPATH, "//button[@class='btn-search tb-bg']")  # locate the search button
    button.click()
    # url for the robot-vacuum search
    # url = 'https://s.1688.com/selloffer/offer_search.htm?keywords=%C9%A8%B5%D8%BB%FA%C6%F7%C8%CB&n=y&netType=1%2C11%2C16&spm=a260k.dacugeneral.search.0'
    # web.execute_script(f"window.open('{url}', 'new_window')")
    time.sleep(5)
    # web.switch_to.window(web.window_handles[-1])
    # switch to the newly opened results window
    new_window_handles = web.window_handles
    web.switch_to.window(new_window_handles[-1])
    time.sleep(2)
    # click "sales" to sort from high to low
    web.find_element(By.XPATH, "//*[@id='sortBarWrap']/div[1]/div[1]/div/div[1]/div/div/div/ul/li[2]/div").click()
    time.sleep(5)
    single(web)
    time.sleep(2)
    # read the total number of result pages
    k = web.find_element(By.XPATH, "//*[@class='next-pagination-display']")
    cs = k.text.split("/")[-1]
    print(f"{cs} pages of data in total")
    # start scraping the remaining pages
    hqsj(web, int(cs) - 1)
    print("Scraping finished!!")
    web.quit()
# scrape the data on the current results page
def single(web):
    # obj_list_row = []  # temporary store for the detail-page links on this page
    titles = []
    jgs = []     # prices
    fkrss = []   # number of buyers (paid orders)
    cdds = []    # places of origin
    dps = []     # shop names
    urls = []
    functions = []
    solds = []   # units sold
    url_temp = []
    # scroll the page to the bottom a few times so all items load
    for i in range(3):
        web.execute_script('window.scrollTo(0,document.body.scrollHeight)')  # scroll to the bottom
        time.sleep(5)
    # XPath locators
    # obj_tit = web.find_elements(By.XPATH, "//div[@class='Title--descWrapper--HqxzYq0 ']")
    obj_jg_xl = web.find_elements(By.XPATH, "//div[@class='Price--priceWrapper--Q0Dn7pN ']")
    obj_dp = web.find_elements(By.XPATH, "//a[@class='ShopInfo--shopName--rg6mGmy']")
    obj_url_list = web.find_elements(By.XPATH, "//a[@class='Card--doubleCardWrapper--L2XFE73']")
    # ,len(obj_tit)
    print("...", len(obj_dp), len(obj_jg_xl), len(obj_url_list))
    for i in range(len(obj_jg_xl)):
        # for i in range(20):  # 0-19
        # title = obj_tit[i].find_element(By.XPATH, "//div[@class='Title--descWrapper--HqxzYq0 ']/div[1]/span[1]").text.replace(" ", "").strip()  # title
        # print("title:", title)
        # title = obj_tit[i].text.replace(" ", "").strip()  # title
        dp = obj_dp[i].text.replace(" ", "").strip()  # shop name
        hb = obj_jg_xl[i].find_elements(By.XPATH, ".//span")[0].text.strip().replace(" ", "")   # currency symbol, e.g. ¥
        jgz = obj_jg_xl[i].find_elements(By.XPATH, ".//span")[1].text.strip().replace(" ", "")  # integer part, e.g. 4800
        jgx = obj_jg_xl[i].find_elements(By.XPATH, ".//span")[2].text.strip().replace(" ", "")  # decimal part, e.g. .80
        fkrs = obj_jg_xl[i].find_elements(By.XPATH, ".//span")[3].text.strip().replace(" ", "").split("人")[0]  # number of buyers
        cd = obj_jg_xl[i].find_elements(By.XPATH, ".//div")
        cdd = " "
        for j in range(len(cd)):  # e.g. cd[0] = 安徽 (province), cd[1] = 芜湖 (city)
            cdd = cdd + cd[j].text.strip()
        jg = hb + jgz + jgx  # e.g. ¥4800.80
        # print(title, jg, fkrs, cdd, dp)
        # titles.append(title)
        jgs.append(jg)      # price
        fkrss.append(fkrs)  # number of buyers
        cdds.append(cdd)    # place of origin
        dps.append(dp)      # shop
    # collect the product detail-page links
    for link in obj_url_list:
        href = link.get_attribute("href")
        # print(href)
        urls.append(href)
        url_temp.append(href)
    obj_detail(web, url_temp, jgs, fkrss, dps)
# page through and scrape the remaining result pages
def hqsj(web, cs):
    for i in range(cs):
        # "next page" button
        an = web.find_element(By.XPATH,
                              "//*[@class='next-icon next-icon-arrow-right next-xs next-btn-icon next-icon-last next-pagination-icon-next']")
        web.execute_script("arguments[0].click();", an)
        time.sleep(5)
        single(web)
# scrape the detail-page information for each product
def obj_detail(web, urls, jgs, fkrss, dps):
    global row
    global number  # declare number as a module-level global so it can be modified inside this function
    for link in urls:
        obj_list_row = []  # temporary buffer for the current item's data
        index_place = urls.index(link)