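"""Working around Taobao's anti-scraping measures.

Uses selenium's webdriver module to pose as a regular browser, then uses the
ActionChains module to simulate the whole slide-to-unlock gesture so that the
Taobao login succeeds; the data is then scraped and written to an Excel file.
"""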
import re
import time

import xlwt
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
# Open Chrome on the Taobao search page; the text after "q=" is the
# URL-encoded product name to search for. The login form filled in below is
# presumably shown because the visitor is not yet authenticated.
browser = webdriver.Chrome()
browser.get("https://s.taobao.com/search?q=%E5%A6%AE%E7%BB%B4%E9%9B%85")

# Enter the account name. find_element(By..., ...) is used instead of the
# find_element_by_* helpers, which were removed in Selenium 4.3; this form
# works on both Selenium 3 and Selenium 4.
browser.find_element(By.XPATH, '//*[@id="fm-login-id"]').send_keys('手机号xxx')
time.sleep(1.5)

# Enter the password.
browser.find_element(By.XPATH, '//*[@id="fm-login-password"]').send_keys('密码xxx')
time.sleep(2)

# Slider verification: press the handle, drag it 258 px to the right and let
# go. The gesture is sent as one chain so the button stays held for the whole
# drag; calling reset_actions() between two perform() calls can release the
# held button on W3C-mode drivers, and the original never released it at all.
button = browser.find_element(By.ID, 'nc_1_n1z')
ActionChains(browser) \
    .click_and_hold(button) \
    .move_by_offset(258, 0) \
    .release() \
    .perform()
time.sleep(1.5)

# Click the login button, then give the page time to load.
browser.find_element(By.XPATH, '//*[@id="login-form"]/div[4]/button').click()
time.sleep(3)
# Collect the HTML of the first result page plus up to EXTRA_PAGES following
# pages. Set EXTRA_PAGES to a large number to fetch every page; the loop is
# always capped at the number of pages that actually exist.
EXTRA_PAGES = 2

try:
    # Source of the first result page.
    data = browser.page_source

    # Total page count, read from the pager JSON embedded in the page.
    p_pages = '"pager":{"pageSize":.*?"totalPage":(.*?),"currentPage"'
    pager_matches = re.findall(p_pages, data, re.S)
    if not pager_matches:
        raise RuntimeError(
            'pager data not found in page source; login or page load probably failed'
        )
    pages = int(pager_matches[0])

    datas = [data]
    # Never click "next" more often than there are remaining pages.
    for _ in range(min(EXTRA_PAGES, pages - 1)):
        # li[8] is presumably the "next page" link of the pager -- verify
        # against the live page if the layout changes.
        browser.find_element(
            By.XPATH, '//*[@id="mainsrp-pager"]/div/div/div/ul/li[8]/a'
        ).click()
        time.sleep(5)  # wait for the next page to render
        data = browser.page_source
        datas.append(data)
        time.sleep(3)  # pause before requesting another page
finally:
    # Always close the browser window, even if a lookup above failed.
    browser.quit()

alldata = ''.join(datas)
# Extract the product fields from the combined page sources. Each pattern
# yields one list; the lists are expected to be parallel (entry i of each
# list describes the same product).
n_name = '<span class="H">妮维雅</span>(.*?)</a>'
name = re.findall(n_name, alldata, re.S)
p_price = '<div class="row row-2 title">.*?trace-price="(.*?)" trace-pid=.*?">'
price = re.findall(p_price, alldata, re.S)
p_sales = '"view_sales":"(.*?)"'
sales = re.findall(p_sales, alldata, re.S)
p_detail_url = '"detail_url":"(.*?)"'
detail_url = re.findall(p_detail_url, alldata, re.S)

# zip() stops at the shortest list instead of raising IndexError; report the
# mismatch so that dropped rows do not go unnoticed.
if not (len(name) == len(price) == len(sales) == len(detail_url)):
    print('warning: extracted field counts differ '
          '(name=%d, price=%d, sales=%d, detail_url=%d); '
          'rows are truncated to the shortest list'
          % (len(name), len(price), len(sales), len(detail_url)))

# Build one [name, price, sales, url] row per product. Rows are assembled
# directly as lists rather than joined with ',' and split again, so a comma
# inside a title or URL can no longer shift the columns.
data_list_content = [
    [
        title.strip(),
        cost,
        sold,
        # The URL is protocol-relative; the encode/decode round trip decodes
        # backslash escapes (presumably \uXXXX sequences from the page JSON).
        'http:' + link.encode('latin-1').decode('unicode_escape'),
    ]
    for title, cost, sold, link in zip(name, price, sales, detail_url)
]
print(data_list_content)
# Save the scraped rows to an Excel workbook.
work_book = xlwt.Workbook(encoding='utf-8')  # create the workbook
sheet = work_book.add_sheet('NWY')  # create a single sheet

# Header row (row 0).
heads = ['name', 'price', 'sales', 'detail_url']
for col, head in enumerate(heads):
    sheet.write(0, col, head)

# Data rows start at row 1, one row per product and one cell per field.
for row, content in enumerate(data_list_content, start=1):
    for col, cont in enumerate(content):
        sheet.write(row, col, cont)

work_book.save('nwy.xls')