说说淘宝吧最近没少研究淘宝网,现在淘宝跟别的平台不一样的地方是他需要登录我们才能搜索商品抓取数据,常规的方法呢去请求url拿数据我们需要带上cookie但是有一部分数据也是拿不到的从app拿还需要逆向解析在下也是属实是不太会逆向解析目前,所以也就自然想到了selenium去抓我想要的数据了,开始我上博客上搜了几个例子发现用selenium去切换账号密码登录时,现在的淘宝会检测出来selenium框架,在我们登录完滑动那个滑动验证码的时候竟然会无限报错,去年还没这个样子应该是淘宝的最新反爬措施,于是乎我上csdn就找解决方法看看有没有高手已经发现这个问题,果不其然找到了两个关于这个问题的解法,一个是在本地替换一段js代码通过fiddler抓包替换,亲测确实有效,另一种是通过切换微博账号登录,我们要用我们的微博账号绑定淘宝账号然后切换微博登录输入微博账号密码竟然也成功了,但是爬着爬着还是会无限的出滑动验证码组合验证码包括封禁我们的淘宝账号的问题,也就是说以上两种方法都有缺陷并且很是麻烦不理想.于是乎我就想到了用扫码登录也参考了一些别人写的代码,通过自己的不断优化修改最终还是成功大批量的抓取到淘宝网商品数据.这里为什么不把详情页也写进去拿详情信息呢,首先淘宝会给我们100页商品数据每页大概48个商品那我们要是挨个进入详情页去拿详情页数据会极大的延缓效率增加淘宝账号被封的风险,所以我考虑到把详情链接取出来之后再单独写一个详情信息页面,然后通过商品的详情链接再去匹配详情页面里面的数据简直是轻而易举.
说下代码:我设置了 MongoDB 和本地 CSV 两种存储方式,其中一种已经被我注释掉了,暂时还不是很需要;另外,淘宝的反爬主要针对请求次数,使用代理 IP 效果不大。经过我的观察,设置延时应该是目前最有效的办法,所以我加入了延时,降低了账号被封的几率。
列表页代码如下:
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pymongo
import time
import re
# browser = webdriver.Firefox()
# browser.maximize_window()#最大化窗口
# wait = WebDriverWait(browser, 25) #等待
def search(ky, browser, wait):
    '''
    Desc:
        Open taobao.com, type the keyword into the search box and submit the
        search; on the way, capture the QR-code login image link and store it.
    param:
        ky      -- search keyword
        browser -- selenium WebDriver instance
        wait    -- WebDriverWait bound to `browser`
    return:
        total.text -- pager text holding the total page count for this
                      keyword (e.g. '共 100 页,'); after a timeout the
                      result of the retried call is propagated
    '''
    try:
        print('正在搜索')
        browser.get('https://www.taobao.com/')  # open the Taobao home page
        # wait until the search box (#q) is present in the DOM
        search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
        # wait until the search button is clickable
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        search_box.send_keys(ky)  # type the keyword into the box
        submit.click()            # trigger the search
        # switch the login dialog to quick/QR mode when the toggle is present;
        # best-effort: the toggle is absent once already logged in
        try:
            button = browser.find_element_by_id('J_Static2Quick')
            button.click()
        except Exception:
            pass
        html = browser.page_source  # snapshot of the rendered page
        html_login = etree.HTML(html)
        # pull the QR-code image URL out of the login widget
        login_link = html_login.xpath('//*[@id="J_QRCodeImg"]/img/@src')
        print(login_link)
        login = {'name': ky, 'login_link': login_link}
        save1(login)  # persist the QR-code link to MongoDB
        # the search has landed once the pager's total-page element appears
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        return total.text
    except TimeoutException:
        # page load timed out: refresh and retry.  The recursive result must
        # be RETURNED — the original dropped it and yielded None to main().
        browser.refresh()
        return search(ky, browser, wait)
def get_products(browser, wait, ky):
    '''
    Desc:
        Parse every product on the current result page with BeautifulSoup
        (link, title, price, sales, store, store location, picture) and save
        each record to MongoDB via save().
    param:
        browser -- selenium WebDriver instance showing a result page
        wait    -- WebDriverWait bound to `browser`
        ky      -- search keyword, stored with each record
    '''
    print('获取产品信息')
    # wait until at least one product item has been rendered
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for item in soup.select('#mainsrp-itemlist .items .item'):
        link = item.select('.J_ClickStat')[0]['href']
        # product links are protocol-relative ('//item.taobao.com/...')
        if not link.startswith('https:'):
            link = 'https:' + link
        title = item.select('.title')[0].text.strip()
        price = item.select('.price')[0].text.strip()
        sales = item.select('.deal-cnt')[0].text[:-3]  # drop 3-char suffix (presumably '人付款')
        store = item.select('.shop')[0].text.strip()
        store_address = item.select('.location')[0].text.strip()
        # fix: an <img> tag has no text, so .text was always empty — read the
        # image URL attribute instead (data-src for lazy-loaded images)
        img = item.select('.pic img')[0]
        picture = (img.get('data-src') or img.get('src') or '').strip()
        print(link)
        time.sleep(2)  # throttle per item to reduce the risk of an account ban
        info = {'name': ky, 'link': link, 'title': title, 'price': price,
                'sales': sales, 'store': store,
                'store_address': store_address, 'picture': picture}
        save(info)  # persist to MongoDB
#翻页
def next_page(page_number, browser, wait, ky):
    '''
    Desc:
        Jump to the given result page by typing its number into the pager
        input and confirming, then scrape the products on it.
    param:
        page_number -- page number to jump to
        browser     -- selenium WebDriver instance
        wait        -- WebDriverWait bound to `browser`
        ky          -- search keyword, forwarded to get_products()
    '''
    try:
        print('正在翻页', page_number)
        time.sleep(5)  # deliberate delay: slow paging lowers the ban risk
        # pager page-number box (renamed so the builtin input() is not shadowed)
        page_input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input"))
        )
        # the confirm button next to the page-number box
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        page_input.clear()                 # clear the previous page number
        page_input.send_keys(page_number)  # type the target page number
        submit.send_keys(Keys.ENTER)       # confirm, triggering the jump
        # the jump has finished once the active pager item shows our number
        wait.until(
            EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number))
        )
        get_products(browser, wait, ky)  # scrape the freshly loaded page
    except TimeoutException:
        # timed out while paging: refresh and retry the same page
        browser.refresh()
        next_page(page_number, browser, wait, ky)
# 配置mongodb
def save(result):
    '''Insert one list-page product record into the spider.tblb collection.

    param:
        result -- dict built by get_products()
    '''
    client = pymongo.MongoClient('localhost', 27017)
    try:
        # insert_one replaces Collection.insert, deprecated and removed in pymongo 4
        client['spider']['tblb'].insert_one(result)
    finally:
        client.close()  # do not leak one connection per record
def save1(result):
    '''Insert one QR-code login record into the spider.login collection.

    param:
        result -- dict built by search() ({'name': ..., 'login_link': ...})
    '''
    client = pymongo.MongoClient('localhost', 27017)
    try:
        # insert_one replaces Collection.insert, deprecated and removed in pymongo 4
        client['spider']['login'].insert_one(result)
    finally:
        client.close()  # do not leak one connection per record
def main(ky, start_page, end_page):
    '''
    Desc:
        Drive one crawl: open a browser, search for the keyword, then walk
        the requested page range, scraping each page.
    param:
        ky         -- search keyword
        start_page -- first result page to scrape (inclusive)
        end_page   -- last result page to scrape (inclusive)
    '''
    browser = webdriver.Chrome()
    browser.maximize_window()           # maximise so all elements render
    wait = WebDriverWait(browser, 180)  # generous timeout: QR login is manual
    total = search(ky, browser, wait)
    # Extract the digits from pager text like '共 100 页,'.  A regex is
    # robust where the original lstrip/rstrip char-set trick was fragile;
    # search() can return None after repeated timeouts, so guard first.
    # (Informational only — the page range is chosen by the caller.)
    if total:
        match = re.search(r'\d+', total)
        if match:
            total = int(match.group())
    for page in range(start_page, end_page + 1):
        time.sleep(1)  # small delay between pages to look less bot-like
        next_page(page, browser, wait, ky)
if __name__ == '__main__':
    # Interactive entry point: ask for the keyword and the page range,
    # run the crawl and report the elapsed wall-clock time.
    ky = input('请输入爬取商品名称:')
    start = time.time()
    start_page = int(input('请输入要爬取的起始页(1-100):'))
    end_page = int(input('请输入要爬取的结束页(1-100):'))
    main(ky, start_page, end_page)
    print('商品爬取完毕')
    end = time.time()
    print(end - start)  # elapsed seconds for the whole crawl
详情页代码如下:
import csv
import time
from selenium import webdriver
import pymongo
def _new_headless_firefox():
    '''Start a headless Firefox configured for scraping (no GPU, no images).'''
    options = webdriver.FirefoxOptions()
    options.add_argument('--disable-gpu')      # work around rendering bugs
    options.add_argument('--hide-scrollbars')  # hide scroll bars on odd pages
    options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
    options.add_argument('--headless')  # required on displayless Linux hosts
    browser = webdriver.Firefox(options=options)
    browser.maximize_window()
    return browser


def _grab(fetch, default=' '):
    '''Run the scraping callable, returning `default` on any failure.

    Mirrors the original per-field try/except-with-blank-fallback pattern.
    '''
    try:
        return fetch()
    except Exception:
        return default


def _scrape_taobao(url):
    '''Scrape one Taobao item page and persist the record via taobao_save().'''
    browser = _new_headless_firefox()
    browser.get(url)
    # close the login overlay if it pops up (best-effort)
    try:
        browser.find_element_by_id('sufei-dialog-close').click()
    except Exception:
        pass
    time.sleep(1)
    # the five gallery thumbnails: read the lazy-load URL and upscale it
    pictures = []
    for i in range(1, 6):
        sel = ('#J_UlThumb > li:nth-child(%d) > div:nth-child(1) '
               '> a:nth-child(1) > img:nth-child(1)' % i)
        pictures.append(_grab(
            lambda s=sel: browser.find_element_by_css_selector(s)
            .get_attribute('data-src')
            .replace('_50x50.jpg', '_430x430.jpg')))
    title = _grab(lambda: browser.find_element_by_css_selector('.tb-main-title').text)
    print('商品标题:', title)
    set_meal = _grab(lambda: browser.find_element_by_css_selector(
        ".tb-shop-name > dl:nth-child(1) > dd:nth-child(1) > strong:nth-child(1) > a:nth-child(1)").text)
    print('店铺名称:', set_meal)
    address = _grab(lambda: browser.find_element_by_id('J-From').text)
    print('发货地点:', address)
    price = _grab(lambda: browser.find_element_by_class_name("tb-rmb-num").text)
    print('产品价格:', price)
    mouth_sales = _grab(lambda: browser.find_element_by_id('J_SellCounter').text)
    print('当月销量:', mouth_sales)
    evaluation = _grab(lambda: browser.find_element_by_id('J_RateCounter').text)
    print('评价人数:', evaluation)

    def _collect():
        # strip the '人气' label and the surrounding parentheses
        text = browser.find_element_by_class_name("J_FavCount").text
        return text.replace('人气', '').replace('(', '').replace(')', '')
    collect = _grab(_collect)
    print('收藏人气:', collect)

    def _grade():
        # shop-rating block: alternating label/value lines, joined in pairs
        ms = browser.find_element_by_class_name('tb-shop-rate').text.split('\n')
        return ms[0] + ':' + ms[1] + ' ' + ms[2] + ':' + ms[3] + ' ' + ms[4] + ':' + ms[5]
    grade = _grab(_grade)
    print('评分描述:', grade)

    def _combo():
        # SKU options, one per line; the last line is dropped as before
        return browser.find_element_by_id('J_isku').text.split('\n')[0:-1]
    combo = _grab(_combo)
    print('属性套餐:', combo)
    details = _grab(lambda: browser.find_element_by_css_selector('#attributes').text.split('\n'))
    print('商品详情', details)
    try:
        info = {'details_url': url,
                'picture1': pictures[0], 'picture2': pictures[1],
                'picture3': pictures[2], 'picture4': pictures[3],
                'picture5': pictures[4],
                'title': title, 'set_meal': set_meal, 'address': address,
                'price': price, 'mouth_sales': mouth_sales,
                'evaluation': evaluation, 'collect': collect,
                'grade': grade, 'combo': combo, 'details': details}
        taobao_save(info)
    except Exception:
        pass  # best-effort persistence: a failed insert must not stop the crawl
    # shut down the headless browser
    browser.close()
    browser.quit()


def _scrape_tmall(url):
    '''Scrape one Tmall item page and persist the record via tmall_save().'''
    browser = _new_headless_firefox()
    browser.get(url)
    # close the login overlay if it pops up (best-effort)
    try:
        browser.find_element_by_id('sufei-dialog-close').click()
    except Exception:
        pass
    # the five gallery thumbnails: plain src attribute, upscaled from 60x60.
    # NOTE(review): selector 2 targets li.tb-selected, unlike its siblings —
    # kept exactly as in the original; confirm it is intentional.
    selectors = [
        '#J_UlThumb > li:nth-child(1) > a:nth-child(1) > img:nth-child(1)',
        'li.tb-selected:nth-child(2) > a:nth-child(1) > img:nth-child(1)',
        '#J_UlThumb > li:nth-child(3) > a:nth-child(1) > img:nth-child(1)',
        '#J_UlThumb > li:nth-child(4) > a:nth-child(1) > img:nth-child(1)',
        '#J_UlThumb > li:nth-child(5) > a:nth-child(1) > img:nth-child(1)',
    ]
    imgs = [_grab(lambda s=sel: browser.find_element_by_css_selector(s)
                  .get_attribute('src').replace('60x60', '430x430'))
            for sel in selectors]
    title1 = _grab(lambda: browser.find_element_by_css_selector('.tb-detail-hd > h1:nth-child(1)').text)
    print('商品标题:', title1)
    set_meal1 = _grab(lambda: browser.find_element_by_css_selector(".slogo-shopname > strong:nth-child(1)").text)
    print('店铺名称:', set_meal1)
    address1 = _grab(lambda: browser.find_element_by_id('J_deliveryAdd').text)
    print('发货地点:', address1)
    price1 = _grab(lambda: browser.find_element_by_class_name("tm-price").text)
    print('产品价格:', price1)
    mouth_sales1 = _grab(lambda: browser.find_element_by_xpath(
        '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/ul/li[1]/div/span[2]').text)
    print('当月销量:', mouth_sales1)
    # strip the '累计评价' label from the cumulative review counter
    evaluation1 = _grab(lambda: browser.find_element_by_id('J_ItemRates')
                        .text.replace('累计评价', ''))
    print('评价人数:', evaluation1)
    # strip the '人气' label and the surrounding parentheses
    collect1 = _grab(lambda: browser.find_element_by_id('J_CollectCount')
                     .text.replace('人气', '').replace('(', '').replace(')', ''))
    print('收藏人气:', collect1)

    def _grade():
        # shop-info sidebar: the fifth line carries the rating summary
        lines = browser.find_element_by_xpath('//*[@id="side-shop-info"]/div').text.split('\n')
        return lines[4:5][0]
    grade1 = _grab(_grade)
    print('评分描述:', grade1)

    def _combo():
        # SKU options, one per line; the last line is dropped as before
        text = browser.find_element_by_css_selector(
            '#J_DetailMeta > div.tm-clear > div.tb-property > div > div.tb-key > div > div').text
        return text.split('\n')[0:-1]
    combo1 = _grab(_combo)
    print('属性套餐:', combo1)
    details1 = _grab(lambda: browser.find_element_by_css_selector('#J_AttrUL').text.split('\n'))
    print('商品详情', details1)
    try:
        info1 = {'details_url': url,
                 'img1': imgs[0], 'img2': imgs[1], 'img3': imgs[2],
                 'img4': imgs[3], 'img5': imgs[4],
                 'title': title1, 'set_meal': set_meal1, 'address': address1,
                 'price': price1, 'mouth_sales': mouth_sales1,
                 'evaluation': evaluation1, 'collect': collect1,
                 'grade': grade1, 'combo': combo1, 'details': details1}
        tmall_save(info1)
    except Exception:
        pass  # best-effort persistence: a failed insert must not stop the crawl
    # shut down the headless browser
    browser.close()
    browser.quit()


def get_detail(ky):
    '''
    Desc:
        Visit every product URL in the ';'-separated string `ky`, scrape the
        detail page (Taobao and Tmall layouts differ) and store the record in
        MongoDB via taobao_save() / tmall_save().  URLs matching neither host
        are silently skipped.
    param:
        ky -- detail-page URLs joined with ';'
    '''
    for url in ky.split(';'):
        if url.startswith('https://item.taobao.com'):
            _scrape_taobao(url)
        elif url.startswith('https://detail.tmall.com'):
            _scrape_tmall(url)
#淘宝信息存储
# def taobao_save(list):
# csvFile = open('淘宝商品详情信息.csv', 'a', newline='', encoding='utf-8-sig') # 设置newline,否则两行之间会空一行
# writer = csv.writer(csvFile)
# writer.writerow(list)
# csvFile.close()
#天猫信息存储
# def tmall_save(list1):
# csvFile = open('天猫商品详情信息.csv', 'a', newline='', encoding='utf-8-sig') # 设置newline,否则两行之间会空一行
# writer = csv.writer(csvFile)
# writer.writerow(list1)
# csvFile.close()
#配置mongodb
def taobao_save(result):
    '''Insert one Taobao detail record into the spider.tbxq collection.

    param:
        result -- dict built by get_detail() for a Taobao item page
    '''
    client = pymongo.MongoClient('localhost', 27017)
    try:
        # insert_one replaces Collection.insert, deprecated and removed in pymongo 4
        client['spider']['tbxq'].insert_one(result)
    finally:
        client.close()  # do not leak one connection per record
#配置mongodb
def tmall_save(result):
    '''Insert one Tmall detail record into the spider.tmxq collection.

    param:
        result -- dict built by get_detail() for a Tmall item page
    '''
    client = pymongo.MongoClient('localhost', 27017)
    try:
        # insert_one replaces Collection.insert, deprecated and removed in pymongo 4
        client['spider']['tmxq'].insert_one(result)
    finally:
        client.close()  # do not leak one connection per record
if __name__ =='__main__':
    # Standalone entry point: `ky` is a ';'-separated string of product
    # detail URLs.  It is empty here, so running this file directly scrapes
    # nothing until it is filled in (or fed from the list-page output).
    ky = ''
    # CSV header row (disabled along with the CSV writers above)
    # headers = ['详情链接','展示图1','展示图2','展示图3','展示图4','展示图5','商品标题','店铺名称','发货地点','产品价格','当月销量','评价人数','收藏人气','评分描述','套餐属性','详情信息'
    # ]
    # taobao_save(headers)
    # tmall_save(headers)
    get_detail(ky)