耗时三天学会了selenium框架下的爬虫,学校要求做个课设,需要一些用户评论的数据,分别为酒店用户评论数据、景点用户评论数据和饭店用户评论数据;携程的酒店信息很难爬取,我就放弃了,但是爬下了携程网站上有关景点的数据。时间紧,代码能用即可,没有优化。
我是一个小白,应该没有比我还白的小白拜读我这篇blog了吧(狗头)。。
代码示例:
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 3 17:06:24 2020

Selenium scraper bootstrap: launches a Chrome browser and opens a Tuniu
hotel detail page. The comment-scraping logic itself is kept below as
commented-out reference code (good/bad review tabs, time-sort, pagination).

@author: Administrator
"""
from selenium import webdriver
# ActionChains is not used by the live code; it was imported for the
# slider-captcha drag attempt kept in the commented-out section below.
from selenium.webdriver import ActionChains
import time  # used by the commented-out scraping loop (sleep between page turns)

# Target page: hotel detail, with check-in/check-out dates in the query string.
url = 'https://hotel.tuniu.com/detail/5404487?checkInDate=2020-09-04&checkOutDate=2020-09-05'

driver = webdriver.Chrome()
driver.get(url)
#tag=2
#n = 'HDBW'
##3_爬取好评;4_爬取差评(景点)
#
##2_爬取好评;6_爬取差评(美食)
#if tag==2:
# s = 'GOOD'
#else:
# s = 'BAD'
#time.sleep(2)
#driver.find_element_by_xpath('//*[@id="pubGlobal_main_content2"]/div[2]/div/a').click()
#time.sleep(2)
#driver.find_element_by_xpath('//*[@id="nloginname"]').send_keys('<YOUR_PHONE_NUMBER>')
#driver.find_element_by_xpath('//*[@id="npwd"]').send_keys('<YOUR_PASSWORD>')
#time.sleep(2)
#huakuai = driver.find_element_by_xpath('//*[@id="sliderddnormal"]')
#*****************************************爬取好评最新时间段的5页*******************************************
##景点参数3/4
#driver.find_element_by_xpath('//*[@id="commentModule"]/div[3]/span[%d]'%(tag)).click()#点击好评或差评
#driver.find_element_by_xpath('//*[@id="commentModule"]/div[4]/span[2]').click()#点击时间顺序
#美食参数2/6
#time.sleep(5)
##driver.find_element_by_xpath('/html/body/div[3]/div/div[1]/div[3]/div[2]/div[3]/ul/li[%d]/a'%(tag)).click()#点击好评或差评
#driver.find_element_by_xpath('//*[@id="selectSort"]/ul/li[3]/a').click()#点击时间顺序
#driver.find_element_by_xpath('//*[@id="commentModule"]/div[3]/span[%d]'%(tag)).click()#点击好评或差评
#driver.find_element_by_xpath('//*[@id="commentModule"]/div[4]/span[2]').click()#点击时间顺序
#a1 = []
#a2 = []
#for i in range(1,11):#爬5页
# if i<6:
# # page = driver.find_element_by_xpath("//*[@id='commentModule']/div[6]/ul/li[10]/div/input").send_keys(i+1)
# content = driver.find_elements_by_class_name('commentDetail')
# for u in content:
## print(u.text)
# a1.append(u.text)
# # a_str = "\n".join(a)
# file = open('ctrip_SSpot_BingHaiStreet_%s_new_50.txt'%(s),'w',encoding='utf-8')
# for j in range(len(a1)):
# file.write(str(a1[j])+'\n'+'\n')
# file.close()#写出文件
# print(" page" ,i," finished ")
# if i<4:
# driver.find_element_by_xpath('//*[@id="commentModule"]/div[6]/ul/li[9]/span/a').click()#翻页
# elif i==4:
# driver.find_element_by_xpath('//*[@id="commentModule"]/div[6]/ul/li[10]/span/a').click()#翻页
# else:
# driver.find_element_by_xpath('//*[@id="commentModule"]/div[6]/ul/li[11]/span/a').click()#翻页
# time.sleep(5)
# else:
# content = driver.find_elements_by_class_name('commentDetail')
# for u in content:
# # print(u.text)
# a2.append(u.text)
# # a_str = "\n".join(a)
# file = open('ctrip_SSpot_BingHaiStreet_%s_old_50.txt'%(s),'w',encoding='utf-8')
# for j in range(len(a2)):
# file.write(str(a2[j])+'\n'+'\n')
# file.close()#写出文件
# print(" page" ,i," finished ")
# if i<4:
# driver.find_element_by_xpath('//*[@id="commentModule"]/div[6]/ul/li[9]/span/a').click()#翻页
# elif i==4:
# driver.find_element_by_xpath('//*[@id="commentModule"]/div[6]/ul/li[10]/span/a').click()#翻页
# else:
# driver.find_element_by_xpath('//*[@id="commentModule"]/div[6]/ul/li[11]/span/a').click()#翻页
# time.sleep(5)
##driver.refresh()
#print('OVER!!')
#driver.close()
#*****************************************爬取好评最新时间段的5页*******************************************