一、原理
selenium的原理很简单,就是模拟人对浏览器的操作,人是怎么操作的,在编写代码时就以这个为逻辑来进行编写。编写起来很是简单,并且也能够很容易纠错。
缺点是速度比较慢,抓取耗时,并且经常弹出验证码,目前尚未找到好的解决办法。
二、代码部分
代码部分没有加 try...except 异常处理语句,有需要的可以自行添加。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import re
import time
import random
import pandas as pd
def spiders():
    """Scrape shop info and user reviews from one Meituan restaurant page
    with Selenium, then save everything to an Excel file.

    Side effects: launches a real Chrome browser, hits the network, and
    writes "美团评论.xlsx" in the current working directory.
    """
    wait_seconds = random.randint(5, 10)  # random delay between page turns

    opt = Options()
    # Hide the automation flag so the site cannot detect Selenium via
    # window.navigator.webdriver.
    opt.add_experimental_option("excludeSwitches", ['enable-automation'])
    # Spoof a regular browser User-Agent.  NOTE: Chrome switches must start
    # with "--" (two ASCII hyphens); the original used an en-dash "–",
    # which Chrome silently ignores.
    opt.add_argument('--user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"')
    # Open in incognito mode (same "--" fix as above).
    opt.add_argument("--incognito")

    web = webdriver.Chrome(options=opt)
    web.get("https://www.meituan.com/meishi/2484377/")
    web.implicitly_wait(10)

    # Basic shop information; the slices strip the Chinese field labels.
    shop_name = web.find_element_by_xpath("//*[@id=\"app\"]/section/div/div[2]/div[1]/div[1]").text[6:]
    shop_score = web.find_element_by_xpath("//*[@id=\"app\"]/section/div/div[2]/div[1]/div[2]/p").text[0]
    address = web.find_element_by_xpath("//*[@id=\"app\"]/section/div/div[2]/div[1]/div[3]/p[1]").text[3:]
    time.sleep(3)

    # Scroll the pagination bar into view; otherwise the "next page"
    # button is covered by other elements and the click is intercepted.
    target = web.find_element_by_xpath("//*[@id=\"app\"]/section/div/div[3]/div[1]/div[3]/div[2]/div[2]/div[11]")
    web.execute_script("arguments[0].scrollIntoView();", target)

    names = []     # reviewer user names          (was: l, u, s — renamed;
    comments = []  # review texts                  `sum`/`len` also shadowed
    scores = []    # per-review star ratings       builtins in the original)
    # The rating is encoded as the pixel width of the lit-star bar in the
    # inline style attribute; compile the pattern once, outside the loop.
    star_width_re = re.compile(r" (\d{2,}\.\d{1,2})")

    for page in range(1, 351):
        print("正在抓取第{}页".format(page))
        # Stop when the "next" arrow is disabled (last page reached).
        # BUGFIX: get_attribute() takes an attribute NAME such as "class";
        # the original passed the class VALUE, which always returned None,
        # so the termination check never fired — and there was no break.
        next_btn_class = web.find_element_by_xpath(
            "//*[@id=\"app\"]/section/div/div[3]/div[1]/div[3]/div[2]/div[2]/div[11]/ul/li[8]/span"
        ).get_attribute("class")
        if next_btn_class and "disabled" in next_btn_class:
            print("完成")
            break

        name_elems = web.find_elements_by_xpath("//div[@class=\"list clear\"]/div[@class=\"info\"]/div[@class=\"name\"]")
        comment_elems = web.find_elements_by_xpath("//div[@class=\"list clear\"]/div[@class=\"info\"]/div[@class=\"desc\"]")
        star_elems = web.find_elements_by_xpath("//div[@class=\"source\"]//ul[@class=\"stars-ul stars-light\"]")
        web.implicitly_wait(3)

        for elem in name_elems:
            names.append(elem.text)
        for elem in comment_elems:
            comments.append(elem.text)
        for elem in star_elems:
            style = elem.get_attribute("style")
            width = star_width_re.findall(style)
            if width:
                # 16.8 px of lit-star width corresponds to one star.
                scores.append(float("".join(width)) / 16.8)
            else:
                # No explicit width in the style -> full five stars.
                scores.append(5.0)

        # Random pause to look less bot-like, then turn the page.
        time.sleep(wait_seconds)
        web.find_element_by_xpath("//span[@class = \"iconfont icon-btn_right\"]").click()
        time.sleep(wait_seconds)

    # Build the result table ONCE, after all pages are collected (the
    # original rebuilt a throwaway DataFrame on every iteration).
    dataframe = pd.DataFrame({
        "店铺名": shop_name,
        "店铺评分": shop_score,
        "地址": address,
        "用户名": names,
        "评分": scores,
        "评论": comments,
    })
    # utf_8_sig avoids mojibake when the file is opened in Excel.
    dataframe.to_excel("美团评论.xlsx", encoding='utf_8_sig')
# Run the spider only when executed as a script, not on import.
# (The original `print(spiders())` just printed None, since spiders()
# returns nothing.)
if __name__ == "__main__":
    spiders()