"""天猫店铺半自动商品详情数据(semi-automatic Tmall shop product-detail scraper)。

把东西都记到网上,不然每次换电脑之后都得从头上网找资料。
自动登录没实现,可以手动登录或者改用扫码登录;“休息一下”那个人机验证暂时不能解决,
可以学习搭建 IP 池,看看能不能绕过。
"""
from selenium import webdriver
import time
import pandas as pd
import re
import datetime
import random
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver import Chrome,ChromeOptions
# --- Browser setup -------------------------------------------------------
# "none" page-load strategy: driver.get() returns right after the initial
# request instead of blocking until the whole page finishes loading.
caps = DesiredCapabilities.CHROME
caps["pageLoadStrategy"] = "none"

# Drop the "enable-automation" switch so Chrome does not advertise that it
# is driven by Selenium (basic anti-bot evasion).
opts = webdriver.ChromeOptions()
opts.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(chrome_options=opts, desired_capabilities=caps)

# Via CDP, inject a script into every new document that redefines
# navigator.webdriver as undefined, defeating the common Selenium check.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})

# Open the Taobao front page. Log in MANUALLY here (password or QR code)
# before entering the shop URL -- automated login is not implemented.
driver.get('https://www.taobao.com/')

# Today's date, stamped onto every scraped row.
today = datetime.datetime.now().strftime('%Y-%m-%d')

# Shop listing page to scrape (prompt: "enter the shop URL to visit").
shop_url = input('输入要访问的店铺地址')
driver.get(shop_url)

# Accumulator: one row per scraped product.
total_info = pd.DataFrame()
# Collect the per-product <div> containers on the current results page.
# NOTE: only the first page is scraped; paging would require clicking the
# "next page" button or templating the URL.
divs = driver.find_elements_by_xpath('//*[@id="J_ShopSearchResult"]/div/div[3]/div')

# The results area also holds non-product divs (the "next page" bar etc.).
# The first div exposing a direct ./a[1] anchor marks that tail, so count
# how many true product divs precede it and truncate there.
nu = 0
for product_div in divs:
    try:
        product_div.find_element_by_xpath('./a[1]').text
        break  # reached the pagination block
    except Exception:  # was a bare `except:` -- narrowed so Ctrl-C still works
        nu += 1
divs = divs[:nu]  # keep only the product divs before the "next page" bar
# Walk every product card, collect its details, and append one row per
# product to total_info. Data missing from the listing page is fetched by
# opening the product's detail page in a second browser tab.
for div in divs:
    l_divs = div.find_elements_by_xpath('./dl')
    for l_div in l_divs:
        # Randomized pause between items to look less like a bot.
        time.sleep(random.uniform(1.1, 2))

        # Product URL -- must be read FIRST: the original code used `add`
        # in the placeholder-image branch below before assigning it, which
        # raised NameError on the first lazy-loaded card.
        add = l_div.find_element_by_xpath(".//a").get_attribute('href')

        pic = l_div.find_element_by_xpath(".//img").get_attribute('src')  # thumbnail URL
        if pic == 'https://assets.alicdn.com/s.gif':
            # Lazy-load placeholder: open the detail page in a new tab and
            # take the real main image from there.
            driver.execute_script("window.open('%s')" % add)
            time.sleep(random.uniform(1.1, 2))
            handles = driver.window_handles
            driver.switch_to.window(handles[1])
            time.sleep(random.uniform(1.1, 2))
            pic = driver.find_element_by_xpath('//*[@id="J_ImgBooth"]').get_attribute('src')
            time.sleep(random.uniform(1.1, 2))
            driver.close()
            time.sleep(random.uniform(1.1, 2))
            driver.switch_to.window(handles[0])

        info = l_div.find_element_by_xpath(".//img").get_attribute('alt')  # product title
        price = l_div.find_element_by_xpath(".//div[@class='cprice-area']").text + "元"  # price
        dsrs = l_div.find_element_by_xpath('//*[@id="shopExtra"]/div[1]/a/strong').text  # shop name

        try:
            # Fast path: read everything from the listing page. The td/cf
            # XPaths exist only on the DETAIL page, so this normally raises
            # and falls through to the slow path below.
            buyer_nums = l_div.find_element_by_xpath(".//div[@class='sale-area']").text  # buyers
            td = l_div.find_element_by_xpath('//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[1]/p').text
            cf = l_div.find_element_by_xpath('//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[1]/p').text
        except Exception:
            # Slow path: open the product detail page in a new tab.
            driver.execute_script("window.open('%s')" % add)
            time.sleep(random.uniform(1.1, 2))
            handles = driver.window_handles
            driver.switch_to.window(handles[1])

            # Dismiss the anti-bot ("sufei") dialog if it pops up.
            try:
                driver.find_element_by_xpath('//*[@id="sufei-dialog-close"]').click()
            except Exception:
                pass

            time.sleep(random.uniform(1.1, 2))
            # Cap the page-load wait. (The original `try: pass / except
            # TimeoutException:` here was dead code referencing an
            # unimported name; it has been removed.)
            driver.set_page_load_timeout(5)

            # Scroll to a random offset so lazy sections render and the
            # session looks more human.
            scroll = random.uniform(10, 2000)
            driver.execute_script("var q=document.documentElement.scrollTop=%s" % scroll)

            # Each field falls back to "--" when its element is missing.
            try:
                buyer_nums = driver.find_element_by_xpath(
                    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/ul/li[1]/div/span[2]').text  # sales
            except Exception:
                buyer_nums = "--"
            try:
                td = driver.find_element_by_xpath(
                    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[1]/p').text  # features blurb
            except Exception:
                td = "--"
            try:
                # Last attribute in the spec list = material / composition.
                cf_items = driver.find_elements_by_xpath('//*[@id="J_AttrUL"]/li')
                cf = cf_items[-1].text
            except Exception:
                cf = "--"

            time.sleep(random.uniform(1.1, 2))
            driver.close()
            time.sleep(random.uniform(1.1, 2))
            driver.switch_to.window(handles[0])
            # Clear localStorage (probably empty, but harmless).
            driver.execute_script('window.localStorage.clear();')

        date = today
        print(pic, info, price, buyer_nums, dsrs, add, td, cf, date, sep='|')
        row = dict(pic=pic, info=info, price=price, buyer_nums=buyer_nums,
                   dsrs=dsrs, add=add, td=td, cf=cf, date=date)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # backward- and forward-compatible equivalent.
        total_info = pd.concat([total_info, pd.DataFrame([row])], ignore_index=True)
        # Keep the current card block in view for the next iteration.
        driver.execute_script("arguments[0].scrollIntoView();", div)
# Fix the column order for export. reindex() (instead of plain column
# selection) avoids a KeyError when nothing was scraped and total_info is
# still an empty, column-less DataFrame -- an empty sheet is written instead.
total_info = total_info.reindex(
    columns=["pic", "info", "price", "buyer_nums", "dsrs", "add", "td", "cf", "date"])
# Export target is currently hard-coded to g:\TT.xlsx.
total_info.to_excel(r'g:\TT.xlsx', index=False)
# 资料来源 (reference):
# https://blog.csdn.net/qq_35866846/article/details/105712147