This is a scraping project I put together for Mr. Shu, who teaches Tourism Management at the graduate school of the Business School. In short, it is one crawler covering three sites: Ctrip (携程网), Dianping (大众点评), and Mafengwo (马蜂窝), collecting negative reviews only, for use in thesis research. A while ago he kindly invited me to teach the graduate students how to scrape with Python, but anti-scraping countermeasures and data-processing skills are not something a beginner can pick up in a day or two. All three of these sites deploy some form of anti-scraping; Dianping in particular uses font obfuscation, which scraping tools can do almost nothing about, at best retrieving text with gaps. For the sake of development speed I did not reverse-engineer the Ajax loading on Ctrip or Mafengwo, and went straight for the bluntest option: Selenium automation. Even clicking the right page nodes took some effort; although I had scraped these sites not long ago, their countermeasures change quickly, so everything had to be rebuilt. I have improved a lot through this project, and I am more convinced than ever that anyone writing scrapers has to learn JavaScript properly. Stay strong, China; stay strong, Wuhan; and I will keep at it too~
Download the target file and run results: https://www.lanzous.com/i9f3xwh
- The target file is shown in the figure below:
Task: scrape each link listed in the spreadsheet and save the scraped content into a txt file named after the corresponding scenic spot; if a spot's review count is 0, it does not need to be scraped (a minimal sketch of that skip logic follows).
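The whole run is driven by that spreadsheet. Here is a minimal sketch of the skip-if-zero logic, assuming the column names used by the scrapers further down (高级别景区 for the spot name, 携程差评 for the review count, 携程 for the link); adjust them if your sheet differs:

import pandas as pd

# Read the spreadsheet named in the mobile scraper below; the column names are
# taken from that code and are an assumption if your sheet is laid out differently.
data = pd.read_excel('./差评统计.xlsx')

for _, row in data.iterrows():
    if int(row['携程差评']) == 0:
        continue  # a review count of 0 means there is nothing to scrape
    file_name = './景点差评/' + str(row['高级别景区']) + '.txt'
    print('would scrape', row['携程'], '->', file_name)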
1. Ctrip (携程网)
1.1 Negative-review scraper for the PC site
import requests, json, time, random
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from lxml import etree
from lxml import html
import pandas as pd

startTime = time.time()  # record the start time

chrome_option = webdriver.ChromeOptions()
chrome_option.add_argument('--proxy-server=112.84.55.122:9999')  # route traffic through a proxy IP
driver = webdriver.Chrome(options=chrome_option)  # the options must be passed in, otherwise the proxy is never used
driver.implicitly_wait(5)

def you_ctrip(file_name, xc_url):
    driver.get(xc_url)
    driver.implicitly_wait(10)
    driver.execute_script("window.scrollBy(0,1600)")
    driver.implicitly_wait(5)
    # click the negative-review tab
    button = driver.find_element_by_xpath('//*[@id="weiboCom1"]/div[2]/ul/li[5]/a')
    driver.execute_script("$(arguments[0]).click()", button)  # click via injected jQuery; see the note after this listing
    driver.implicitly_wait(5)
    driver.execute_script("window.scrollBy(0,1000)")
    try:
        PageNumber = driver.find_element_by_xpath('//div[@class="pager_v1"]/span/b').text
    except:
        PageNumber = False
    source = driver.page_source
    you_ctrip_spider(source, file_name)
    if PageNumber:
        print("PageNumber = ", PageNumber)
        for i in range(2, int(PageNumber) + 1):
            time.sleep(2)
            print("@" * 50)
            search = driver.find_element_by_id('gopagetext')  # locate the page-jump input box
            search.send_keys(i)           # type the page number
            search.send_keys(Keys.ENTER)  # press Enter to jump to that page
            driver.execute_script("window.scrollBy(0,10000)")
            time.sleep(1)
            source = driver.page_source
            you_ctrip_spider(source, file_name)
    # repeat the same procedure for the tab at li[6]
    button = driver.find_element_by_xpath('//*[@id="weiboCom1"]/div[2]/ul/li[6]/a')
    driver.execute_script("$(arguments[0]).click()", button)
    driver.implicitly_wait(5)
    try:
        PageNumber = driver.find_element_by_xpath('//div[@class="pager_v1"]/span/b').text
    except:
        PageNumber = False
    # grab the page source and parse it
    source = driver.page_source
    you_ctrip_spider(source, file_name)
    if PageNumber:
        print("PageNumber = ", PageNumber)
        for i in range(2, int(PageNumber) + 1):
            time.sleep(2)
            print("@" * 50)
            search = driver.find_element_by_id('gopagetext')  # locate the page-jump input box
            search.send_keys(i)           # type the page number
            search.send_keys(Keys.ENTER)  # press Enter to jump to that page
            driver.execute_script("window.scrollBy(0,10000)")
            time.sleep(1)
            # grab the page source and parse it
            source = driver.page_source
            you_ctrip_spider(source, file_name)

def you_ctrip_spider(source, file_name):
    xc_html = html.fromstring(source)
    # extract the full review text
    xc_user_comments = xc_html.xpath('//li[@class="main_con"]/span/text()')
    xc_user_comment = "".join(xc_user_comments)
    print("xc_user_comment = ", xc_user_comment)
    with open(file_name, "a", encoding="utf-8") as f:
        f.write(xc_user_comment + "\n")  # the with-block closes the file; no explicit close needed

def main():
    file_name = './景点差评测试.txt'
    max_comment = 41
    if int(max_comment) != 0:
        maxSlide = int(max_comment / 10)  # unused by the PC scraper; kept for parity with the mobile version
        xc_url = "https://you.ctrip.com/sight/guiding120451/145654.html"
        if "sight" in xc_url:
            you_ctrip(file_name, xc_url)

if __name__ == '__main__':
    main()
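One detail worth calling out: the clicks above are injected as $(arguments[0]).click(), which only works because Ctrip's PC pages happen to ship jQuery. Here is a jQuery-free sketch of the same click using Selenium's explicit waits with the same XPath as above (the helper name is my own):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_bad_review_tab(driver, xpath='//*[@id="weiboCom1"]/div[2]/ul/li[5]/a'):
    # Wait until the tab is genuinely clickable instead of relying on implicit waits.
    button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    # A plain JavaScript click needs nothing from the page itself.
    driver.execute_script("arguments[0].click()", button)

Explicit waits would also make the page-jump loop less timing-sensitive than the fixed time.sleep(2).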
1.2 Negative-review scraper for the mobile site
import requests, json, time, random
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
from lxml import html
import pandas as pd

# read the spreadsheet of scenic spots
data = pd.read_excel('./差评统计.xlsx')
startTime = time.time()  # record the start time

chrome_option = webdriver.ChromeOptions()
chrome_option.add_argument('--proxy-server=112.84.55.122:9999')  # route traffic through a proxy IP
driver = webdriver.Chrome(options=chrome_option)  # pass the options in so the proxy is actually used
driver.implicitly_wait(5)

def spider_xiecheng(maxSlide, file_name, xc_url):
    driver.get(xc_url + '&tag=-12')  # &tag=-12 is the negative-review filter on the mobile page
    # scroll down with Selenium so the lazy-loaded reviews render
    if int(maxSlide) > 0:
        for i in range(0, int(maxSlide)):
            driver.execute_script("window.scrollTo(0,10000)")
            time.sleep(1)  # pause so the next batch of reviews can load
    # grab the page source and parse it
    time.sleep(2)
    source = driver.page_source
    xc_html = html.fromstring(source)
    # extract the full review text
    xc_user_comments = xc_html.xpath('//*[@id="c_gs_comments_commentdetail"]//text()')
    xc_user_comment = "".join(xc_user_comments)
    # extract all seller replies
    seller_replys = xc_html.xpath('//div[@class="seller-reply"]//text()')
    seller_reply = "".join(seller_replys)
    # save the data
    with open(file_name, "a", encoding="utf-8") as f:
        f.write(xc_user_comment + "\n")
        f.write(seller_reply + "\n")

def main():
    for i in range(0, 96):
        file_name = './景点差评/' + str(data['高级别景区'][i]) + '.txt'
        max_comment = int(data['携程差评'][i])
        if int(max_comment) != 0:
            maxSlide = int(max_comment / 10)  # about ten reviews load per scroll, hence the division by 10
            xc_url = data['携程'][i]
            spider_xiecheng(maxSlide, file_name, xc_url)
        else:
            print("No negative reviews on Ctrip for 《%s》" % (data['高级别景区'][i]))
        print("Scraping target %s of 97" % (i + 1))

if __name__ == '__main__':
    main()
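The mobile scraper estimates the number of scrolls from the review count (max_comment / 10); if that estimate is off, reviews are silently missed. A count-free alternative is to scroll until the page height stops growing. A minimal sketch (max_rounds is an arbitrary safety cap of my own):

import time

def scroll_to_bottom(driver, pause=1.0, max_rounds=30):
    # Keep scrolling until the document height stops changing,
    # i.e. the lazy loader has no more reviews to fetch.
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(pause)  # give the next batch time to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height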
1.3 Running the PC and mobile scrapers together
Note: the Ctrip links in the spreadsheet are not uniform; some point at the mobile site and some at the PC site, so the two kinds have to be routed to different scrapers (a small dispatch sketch follows).
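The PC code in 1.1 already keys on the substring "sight" appearing in PC links (you.ctrip.com/sight/...). Under that same assumption, with everything else treated as a mobile link, a minimal per-row dispatcher might look like:

def scrape_one(maxSlide, file_name, xc_url):
    # PC attraction pages live under you.ctrip.com/sight/...
    if "sight" in str(xc_url):
        you_ctrip(file_name, xc_url)
    else:
        # Everything else is assumed to be a mobile link; adjust if the
        # spreadsheet contains other URL shapes.
        spider_xiecheng(maxSlide, file_name, xc_url)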
import requests, json, time, random
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
from lxml import html
import pandas as pd

data = pd.read_excel('./差评统计.xlsx')
startTime = time.time()  # record the start time

chrome_option = webdriver.ChromeOptions()
chrome_option.add_argument('--proxy-server=112.84.55.122:9999')  # route traffic through a proxy IP
driver = webdriver.Chrome(options=chrome_option)
driver.implicitly_wait(5)

def you_ctrip(file_name, xc_url):
    driver.get(xc_url)
    driver.implicitly_wait(10)
    driver.execute_script("window.scrollBy(0,1600)")
    driver.implicitly_wait(5)
    # click the negative-review tab
    button = driver.find_element_by_xpath('//*[@id="weiboCom1"]/div[2]/ul/li[5]/a')
    driver.execute_script("$(arguments[0]).click()", button)
    driver.implicitly_wait(5)
    driver.execute_script("window.scrollBy(0,1000)")
    try:
        PageNumber = driver.find_element_by_xpath('//div[@class="pager_v1"]/span/b').text
    except:
        PageNumber = False
    source = driver.page_source
    you_ctrip_spider(source, file_name)
    if PageNumber:
        # print("PageNumber = ", PageNumber)
        for i in range(2, int(PageNumber) + 1):
            time.sleep(2)
            # (the listing is cut off here in the original post; up to this point
            # it is identical to the PC scraper in section 1.1)