本文使用python的selenium库模拟点击爬取携程景点的景点信息和评论信息,自动过滤景点评论不超过10条的景点。
使用selenium库的优缺点
优点:不容易被ban
缺点:时间长
安装selenium
pip install selenium
下载浏览器驱动
http://chromedriver.storage.googleapis.com/index.html
http://npm.taobao.org/mirrors/chromedriver/
驱动和py文件要放在一个文件夹下
import csv
from time import sleep

import xlwt  # Excel operations
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
# Start a Microsoft Edge session driven by Selenium.
# A headless run would instead build webdriver.EdgeOptions(), add the
# '--headless' and '--disable-gpu' arguments and pass it as options=... here.
driver = webdriver.Edge()

# Open the Ctrip landing page and type the search keyword.
driver.get("https://hotels.ctrip.com/?allianceid=4897&sid=798178&bd_vid=8152353566163309773")
driver.find_element(By.XPATH, '//*[@id="_allSearchKeyword"]').send_keys('香港')

# NOTE: implicitly_wait() does not pause the script; it only sets how long
# later find_element() calls keep polling for an element before giving up.
# The search button is clicked five times because, per the original author,
# a single click may not register.
search_button_xpath = '//*[@id="search_button_global"]'
for lookup_timeout in (2, 2, 2, 10, 2):
    driver.implicitly_wait(lookup_timeout)
    driver.find_element(By.XPATH, search_button_xpath).click()
driver.implicitly_wait(2)

# Continue in the most recently opened tab (the original comment calls it
# the new tab) and maximise the window.
driver.switch_to.window(driver.window_handles[-1])
driver.maximize_window()

# Click the tab the original author labels "attractions"; it is clicked
# twice, each click followed by a new lookup timeout.
attractions_tab_xpath = '//*[@id="__next"]/div/div/div/div[3]/div[1]'
for lookup_timeout in (5, 10):
    driver.find_element(By.XPATH, attractions_tab_xpath).click()
    driver.implicitly_wait(lookup_timeout)
# ---- Scrape attraction details and comment pages, then export to CSV ----
#
# names / introduces / contents are kept index-aligned: entry k of each list
# belongs to the same attraction. All comment pages read for one attraction
# are joined into a single string so that one attraction maps to one CSV row.
names = []
introduces = []
counts = []  # never filled by this script; kept because the original defines it
contents = []

# Position of the "next page" <li> inside the comment pager for each
# successive click. The original script clicks li[9] three times, li[10]
# once and li[11] five times (presumably the pager layout shifts as the
# page number grows - confirm against the live page).
next_page_li_positions = (9, 9, 9, 10, 11, 11, 11, 11, 11)
comment_module_xpath = '//*[@id="commentModule"]'

driver.switch_to.window(driver.window_handles[-1])
# NOTE: implicitly_wait() is not a sleep; it sets the polling timeout used
# by subsequent find_element() calls.
driver.implicitly_wait(10)

for result_page in range(10):  # walk through the first ten result pages
    for slot in range(11):  # eleven card slots per result page
        if slot == 5:
            # The original author marks this slot as an advertisement.
            continue

        driver.implicitly_wait(5)
        driver.execute_script('window.scrollBy(0,180)')  # scroll down a bit
        driver.implicitly_wait(20)
        # Open the attraction card, then move to the newest window handle
        # (the detail page).
        driver.find_element(
            By.XPATH,
            '//*[@id="content"]/div[4]/div/div[2]/div/div[3]/div[{}]/div[1]'.format(slot + 1),
        ).click()
        driver.implicitly_wait(10)
        driver.switch_to.window(driver.window_handles[-1])

        # Read everything into locals first, so the shared lists are only
        # touched once the attraction is known to be kept.
        name = driver.find_element(
            By.XPATH, '//*[@id="__next"]/div[3]/div/div[3]/div[2]').text
        introduce = driver.find_element(
            By.XPATH, '//*[@id="__next"]/div[3]/div/div[4]/div[1]/div[2]/div').text
        comment_pages = [driver.find_element(By.XPATH, comment_module_xpath).text]

        # Best effort: page through the comments until the pager link can no
        # longer be found or clicked. Only Selenium errors are swallowed, so
        # Ctrl+C and programming errors still surface.
        try:
            for li_position in next_page_li_positions:
                driver.execute_script('window.scrollBy(0,1800)')
                driver.find_element(
                    By.XPATH,
                    '//*[@id="commentModule"]/div[6]/ul/li[{}]/span/a'.format(li_position),
                ).click()
                driver.implicitly_wait(10)
                driver.execute_script("var q=document.documentElement.scrollTop=0")
                comment_pages.append(
                    driver.find_element(By.XPATH, comment_module_xpath).text)
        except WebDriverException:
            pass

        # Keep the attraction only if more than one comment page was read;
        # the article describes this as filtering out attractions with no
        # more than 10 comments.
        if len(comment_pages) > 1:
            names.append(name)
            introduces.append(introduce)
            contents.append("\n".join(comment_pages))

        driver.close()  # close the detail tab
        driver.switch_to.window(driver.window_handles[-1])  # back to the list

    driver.implicitly_wait(10)
    try:
        # Pager link that loads the next batch of attractions.
        driver.find_element(
            By.XPATH,
            '//*[@id="content"]/div[4]/div/div[2]/div/div[3]/div[12]/div/a[7]',
        ).click()
    except WebDriverException:
        # No usable "next" link: stop crawling but still save what was
        # collected instead of crashing before the export.
        print("Could not open the next result page; stopping early.")
        break
    driver.implicitly_wait(10)
    driver.execute_script("var q=document.documentElement.scrollTop=0")  # back to top
    driver.implicitly_wait(5)

# ---- Export: one CSV row per kept attraction ----
col = ["介绍", "简介", "评论"]
# newline='' is what the csv module requires to avoid blank lines on Windows.
with open('携程景点5.csv', 'a', encoding='utf-8-sig', newline='') as w:
    wr = csv.DictWriter(w, col)
    # The file is opened in append mode: write the header only when the file
    # is still empty so repeated runs do not insert extra header rows.
    if w.tell() == 0:
        wr.writeheader()
    for name, introduce, comments in zip(names, introduces, contents):
        wr.writerow({"介绍": name, "简介": introduce, "评论": comments})

# Shut the browser and its driver process down; the final message announces it.
driver.quit()
print("关闭浏览器,保存数据")
这份代码还存在一些缺陷,随后会更新改进后的新代码。