爬遍电商之淘宝篇
今天用selenium实现淘宝评论抓取,首先当然是解决登录问题了,不过目前是手动登录,日后有时间再细细研究研究,他这个登录确实有点牛皮
采用cmd命令以远程调试模式打开Chrome,再用selenium接管这个浏览器即可:手动输入账号密码并登录成功后,关闭提示框,爬虫就会立刻开始工作。
def login_taobao():
    """Start a local Chrome in remote-debugging mode and attach Selenium to it.

    Chrome is launched through the Windows shell with the Taobao login page
    open, then Selenium connects to it over the debugging port. The call to
    ``time_count()`` pauses the script; the user is expected to log in by
    hand before letting the script continue.

    Returns:
        The ``webdriver.Chrome`` instance attached to the running browser.
    """
    chrome_debug_port = 9999
    # Raw strings keep the Windows backslashes literal. The original non-raw
    # literal relied on the invalid escapes "\s" and "\A", which newer Python
    # versions warn about.
    chrome_dir = r'C:\Program Files (x86)\Google\Chrome\Application'
    profile_dir = r'C:\selenum\AutomationProfile'
    login_url = 'https://login.taobao.com/member/login.jhtml?'
    # Open the local Chrome on the login page. The original note says the PATH
    # environment variable must be configured beforehand (presumably so the
    # driver executable can be found -- confirm for your setup).
    os.system(
        f'cd "{chrome_dir}"&start chrome.exe '
        f'--remote-debugging-port={chrome_debug_port} '
        f'--user-data-dir="{profile_dir}" "{login_url}"'
    )
    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", f"127.0.0.1:{chrome_debug_port}")
    # Attach Selenium to the browser that was just started.
    browser = webdriver.Chrome(options=chrome_options)
    browser.maximize_window()
    # Shows the account/password hint window; it doubles as the way to pause
    # the script while the user logs in manually.
    time_count()
    print(browser.title)
    return browser
这是在time_count_for_taobao.py中的账号密码提示框
import tkinter
def close():
    """Destroy the Tk window stored in the module-level ``root``."""
    # Wired up as the callback of the close button created in time_count().
    root.destroy()
def time_count():
    """Show a small always-on-top hint window and run its event loop.

    The window holds two text boxes pre-filled with placeholder account and
    password text, plus a close button. The function returns only after the
    Tk main loop ends, so callers use it to pause the script.
    """
    global root
    # Create the application window, then set its title and size.
    root = tkinter.Tk()
    root.attributes('-topmost', 1)
    root.title('我只是一个计时器')
    root.configure(width=200, height=110)
    # The window size is fixed.
    root.resizable(False, False)

    # Two stacked Text widgets carrying the placeholder strings.
    for y_offset, placeholder in ((10, '你的账号'), (40, '你的密码')):
        text_box = tkinter.Text(root, width=50)
        text_box.place(x=10, y=y_offset, width=180, height=30)
        text_box.insert('0.0', placeholder)

    # Close button; its callback destroys the window.
    close_button = tkinter.Button(root, text="关闭", command=close)
    close_button.place(x=70, y=75, width=50, height=30)
    root.mainloop()
登录之后得先知道你想爬哪款商品,我这边随便来了个奶粉的商品,确定评论和下一页的XPATH就可以开动啦
https://detail.tmall.com/item.htm?id=35582751706&scene=taobao_shop
但是你会发现,用selenium直接click()“累计评论”一定会报错,原因貌似是淘宝有轨迹验证。所以必须先滚动一下页面,直到“累计评论”出现后再点击;翻页倒是没有这个问题。
# Keep nudging the page until the reviews tab can be clicked: a direct click
# fails until the page has been scrolled.
while True:
    try:
        # find_element('xpath', ...) works on Selenium 3 and 4, whereas
        # find_element_by_xpath was removed in Selenium 4.3.
        brower.find_element('xpath', '//div[@id="J_TabBarBox"]/ul/li[2]/a').click()
        break
    except Exception:
        # Not a bare except, so Ctrl-C can still interrupt this unbounded loop.
        brower.execute_script('window.scrollTo(0,' + str(100 + random.random() * 30) + ')')
        time.sleep(random.random())
接着就是获取用户名、评论内容、评论时间、追加评论。标着123的是没有追加评论的普通评论,标着1234的是有追加评论的;两者的区别只是后者中间多了一层class="tm-rate-premiere"的div,就这一处稍微不同,用一个if else就能搞定。
def get_pinglun_info(text):
    """Parse one page of review HTML and append every review to ``data_my``.

    Each review row becomes a dict with the keys '用户名' (user name),
    '评论内容' (review text), '评论时间' (review date) and '追加评论'
    (follow-up review, '' when there is none). The dict is printed and
    appended to the module-level ``data_my`` list.

    A field that cannot be found in the markup is stored as '' instead of
    raising ``IndexError``, so one incomplete row does not abort the page.

    Args:
        text: HTML source of the item page with the reviews rendered.
    """
    def first_text(node, xpath):
        # xpath() returns a list; it is empty when nothing matched.
        matches = node.xpath(xpath)
        return matches[0] if matches else ''

    if not text:
        return
    source = etree.HTML(text)
    if source is None:
        # Defensive: nothing to walk if the parser produced no tree.
        return
    user_name = re.findall('<div class="rate-user-info">(.*?)</div>', text)
    info_list = source.xpath('//div[@class="rate-grid"]/table/tbody/tr')
    for i, row in enumerate(info_list):
        item = {}
        # User names come from a regex over the raw HTML, so the list may be
        # shorter than the number of rows; fall back to ''.
        raw_name = user_name[i] if i < len(user_name) else ''
        item['用户名'] = raw_name.replace('<span>', '').replace('</span>', '')
        if row.xpath('./td[1]/div[@class="tm-rate-premiere"]'):
            # Review with a follow-up: the first review sits inside the
            # "tm-rate-premiere" wrapper, the follow-up inside "tm-rate-append".
            item['评论内容'] = first_text(row, './td[1]/div[@class="tm-rate-premiere"]//div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]/text()')
            item['评论时间'] = first_text(row, './td[1]/div[@class="tm-rate-premiere"]/div[@class="tm-rate-tag"]//div[@class="tm-rate-date"]/text()')
            item['追加评论'] = first_text(row, './td[1]/div[@class="tm-rate-append"]//div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]/text()')
        else:
            # Plain review without a follow-up.
            item['评论内容'] = first_text(row, './td[1]/div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]/text()')
            item['评论时间'] = first_text(row, './td[1]/div[@class="tm-rate-date"]/text()')
            item['追加评论'] = ''
        print(item)
        data_my.append(item)
最后一点:翻页间隔太短时会跳验证码。我这边的做法是让代码暂停,手动过掉验证码,然后继续执行代码,秘诀就是用前面那个提示框。不过只要设置成5秒翻页一次,基本不会跳验证码。好了,全部代码放上:
# -*- coding: utf-8 -*-
import os
import re
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
from lxml import etree, html
from time_count_for_taobao import time_count
# Module-level accumulator: one dict per scraped review.
data_my = []
def main(path):
    """Scrape the review pages of one item and append them to a CSV file.

    Steps: attach to a manually logged-in Chrome via ``login_taobao()``, open
    ``path``, click the reviews tab, parse the first page, then keep clicking
    "next page" (up to page 119) and parse each page with
    ``get_pinglun_info``. Collected rows in ``data_my`` are appended to
    ``./taobao_pinglun.csv``.

    Args:
        path: URL of the item detail page.
    """
    brower = login_taobao()
    try:
        brower.get(path)
        time.sleep(3 + random.random())
        # A direct click on the reviews tab fails until the page has been
        # scrolled, so scroll a little and retry until the click succeeds.
        while True:
            try:
                # find_element('xpath', ...) works on Selenium 3 and 4;
                # find_element_by_xpath was removed in Selenium 4.3.
                brower.find_element('xpath', '//div[@id="J_TabBarBox"]/ul/li[2]/a').click()
                break
            except Exception:
                # Not a bare except, so Ctrl-C can still stop this loop.
                brower.execute_script('window.scrollTo(0,' + str(100 + random.random() * 30) + ')')
                time.sleep(random.random())
        time.sleep(2 + random.random())
        get_pinglun_info(brower.page_source)
        print('-------第1页爬取完成-------')
        for i in range(2, 120):
            # Check for a captcha dialog. Only the lookup is guarded, so a
            # failure inside time_count() is not misreported as "no captcha".
            try:
                brower.find_element('xpath', '//div[contains(@role,"dialog") and contains(@aria-hidden,"false")]')
            except Exception:
                print('------无验证码,安全-------')
            else:
                print('-------出现验证码-------')
                # Pause until the user has solved the captcha and closed the window.
                time_count()
            # Go to the next page; a missing or unclickable "next page" link
            # is treated as the end of the reviews.
            try:
                next_page = brower.find_element('xpath', '//div[@class="rate-page"]/div[@class="rate-paginator"]//a[contains(text(),"下一页>>")]')
                brower.execute_script("arguments[0].click();", next_page)
            except Exception:
                print('没有更多评论了,总共抓取评论%s条' % len(data_my))
                # Stop here; the original kept looping through every remaining page number.
                break
            time.sleep(5 + random.random())
            try:
                get_pinglun_info(brower.page_source)
            except Exception as exc:
                # Stay best-effort: report the parse failure and move on to
                # the next page instead of aborting the whole run.
                print('-------第%s页解析失败: %r-------' % (i, exc))
            else:
                print('-------第%s页爬取完成-------' % i)
    finally:
        # Save whatever was collected and release the browser even when the
        # scrape stopped early.
        try:
            if data_my:
                df = pd.DataFrame(data_my)
                # mode='a' appends, so a rerun adds another header row to the file.
                df.to_csv('./taobao_pinglun.csv', index=None, encoding='utf-8-sig', mode='a')
        finally:
            brower.quit()
def login_taobao():
    """Start a local Chrome in remote-debugging mode and attach Selenium to it.

    Chrome is launched through the Windows shell with the Taobao login page
    open, then Selenium connects to it over the debugging port. The call to
    ``time_count()`` pauses the script; the user is expected to log in by
    hand before letting the script continue.

    Returns:
        The ``webdriver.Chrome`` instance attached to the running browser.
    """
    chrome_debug_port = 9999
    # Raw strings keep the Windows backslashes literal. The original non-raw
    # literal relied on the invalid escapes "\s" and "\A", which newer Python
    # versions warn about.
    chrome_dir = r'C:\Program Files (x86)\Google\Chrome\Application'
    profile_dir = r'C:\selenum\AutomationProfile'
    login_url = 'https://login.taobao.com/member/login.jhtml?'
    # Open the local Chrome on the login page. The original note says the PATH
    # environment variable must be configured beforehand (presumably so the
    # driver executable can be found -- confirm for your setup).
    os.system(
        f'cd "{chrome_dir}"&start chrome.exe '
        f'--remote-debugging-port={chrome_debug_port} '
        f'--user-data-dir="{profile_dir}" "{login_url}"'
    )
    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", f"127.0.0.1:{chrome_debug_port}")
    # Attach Selenium to the browser that was just started.
    browser = webdriver.Chrome(options=chrome_options)
    browser.maximize_window()
    # Pause here while the user logs in manually.
    time_count()
    print(browser.title)
    return browser
def get_pinglun_info(text):
    """Parse one page of review HTML and append every review to ``data_my``.

    Each review row becomes a dict with the keys '用户名' (user name),
    '评论内容' (review text), '评论时间' (review date) and '追加评论'
    (follow-up review, '' when there is none). The dict is printed and
    appended to the module-level ``data_my`` list.

    A field that cannot be found in the markup is stored as '' instead of
    raising ``IndexError``, so one incomplete row does not abort the page.

    Args:
        text: HTML source of the item page with the reviews rendered.
    """
    def first_text(node, xpath):
        # xpath() returns a list; it is empty when nothing matched.
        matches = node.xpath(xpath)
        return matches[0] if matches else ''

    if not text:
        return
    source = etree.HTML(text)
    if source is None:
        # Defensive: nothing to walk if the parser produced no tree.
        return
    user_name = re.findall('<div class="rate-user-info">(.*?)</div>', text)
    info_list = source.xpath('//div[@class="rate-grid"]/table/tbody/tr')
    for i, row in enumerate(info_list):
        item = {}
        # User names come from a regex over the raw HTML, so the list may be
        # shorter than the number of rows; fall back to ''.
        raw_name = user_name[i] if i < len(user_name) else ''
        item['用户名'] = raw_name.replace('<span>', '').replace('</span>', '')
        if row.xpath('./td[1]/div[@class="tm-rate-premiere"]'):
            # Review with a follow-up: the first review sits inside the
            # "tm-rate-premiere" wrapper, the follow-up inside "tm-rate-append".
            item['评论内容'] = first_text(row, './td[1]/div[@class="tm-rate-premiere"]//div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]/text()')
            item['评论时间'] = first_text(row, './td[1]/div[@class="tm-rate-premiere"]/div[@class="tm-rate-tag"]//div[@class="tm-rate-date"]/text()')
            item['追加评论'] = first_text(row, './td[1]/div[@class="tm-rate-append"]//div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]/text()')
        else:
            # Plain review without a follow-up.
            item['评论内容'] = first_text(row, './td[1]/div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]/text()')
            item['评论时间'] = first_text(row, './td[1]/div[@class="tm-rate-date"]/text()')
            item['追加评论'] = ''
        print(item)
        data_my.append(item)
if __name__ == '__main__':
    # Script entry point: scrape the reviews of this item page.
    path = 'https://detail.tmall.com/item.htm?id=35582751706&scene=taobao_shop'
    main(path)
强大如淘宝,4W+评论只让我爬了99页,一共是1980条评论,最后结果如下