对携程酒店用户评价爬取
(1)直接贴代码,后面逐步分析
import re
import urllib.request, urllib.error
import xlwt
import sqlite3
from bs4 import BeautifulSoup
def main():
    """Scrape the reviews of the target Ctrip page and save them to an Excel file."""
    # Step 1: download the target page and extract its reviews.
    reviews = getDate("https://you.ctrip.com/sight/qingyang2613/126392.html?scene=online")
    # Step 2: write the extracted reviews to the output workbook.
    SavaPath(reviews, "D:/九华山.xls")
# Compiled patterns for the fields of a review, applied to HTML text.

# Reviewer's user name.
findTitle = re.compile(r'<div class="userName">(.*?)</div>')
# Score: the first run of digits that directly follows a '>' somewhere after
# the opening averageScore span tag.
findLevel = re.compile(r'<span class="averageScore">[\s\S]*?>([0-9]+)', flags=re.DOTALL)
# Time of the review.
findTime = re.compile(r'<div class="commentTime">(.*?)</div>')
# Review text; DOTALL lets the captured text span several lines.
findAssess = re.compile(r'<div class="commentDetail">(.*?)</div>', flags=re.DOTALL)
# Download a page and extract the reviews it contains.
def getDate(BaseUrl):
    """Fetch ``BaseUrl`` and collect the fields of every review item on the page.

    Returns a list with one entry per ``<div class="commentItem">`` element.
    Each entry is a four-element list in the order user name, score, time,
    review text; every element is the list of matches ``re.findall`` produced
    for that field (an empty list when nothing matched).
    """
    page = BeautifulSoup(AskURL(BaseUrl), "html.parser")
    # Field order here fixes the column order of each extracted record.
    field_patterns = (findTitle, findLevel, findTime, findAssess)
    datalist = []
    for node in page.find_all('div', class_="commentItem"):
        # The patterns work on text, so serialise the tag back to HTML first.
        markup = str(node)
        datalist.append([re.findall(pattern, markup) for pattern in field_patterns])
    return datalist
# Fetch the content of the page at a given URL.
def AskURL(url):
    """Return the UTF-8 decoded body of ``url``, or ``""`` if the request fails.

    The request is sent with a desktop-browser ``User-Agent`` header so the
    server answers as it would to a regular browser.

    On ``urllib.error.URLError`` (which includes ``HTTPError``) the error's
    ``code`` and/or ``reason`` are printed, when present, and the empty
    string is returned. A body that is not valid UTF-8 raises
    ``UnicodeDecodeError``; that exception is not handled here.
    """
    # Pretend to be a browser when sending the request to Ctrip.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    # Alternative (mobile) User-Agent, kept for reference:
    # head={"user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Mobile Safari/537.36 Edg/90.0.818.62"}
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even when read()/decode()
        # raises, so the underlying connection is never leaked.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries both attributes; a plain URLError only has `reason`.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Step 3: save the data.
def SavaPath(datalist, savaPath):
    """Write the scraped reviews to an Excel workbook saved at ``savaPath``.

    ``datalist`` holds one record per review; the first four elements of each
    record are written, in order, under the header row. Progress is printed
    for every record.
    """
    print("save....")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # new workbook
    sheet = book.add_sheet('携程酒店评价', cell_overwrite_ok=True)  # new worksheet
    col = ("用户名", "分数", "时间", "评价")
    # Row 0 carries the column headings.
    for col_idx, heading in enumerate(col):
        sheet.write(0, col_idx, heading)
    # Records start at row 1, directly below the headings.
    for row_idx, data in enumerate(datalist, start=1):
        print("第%d条" % row_idx)
        for col_idx in range(len(col)):
            sheet.write(row_idx, col_idx, data[col_idx])
    book.save(savaPath)
# Script entry point: run the scraper only when this file is executed directly.
if __name__ == "__main__":
    main()
(2)如果直接爬取携程官网的页面,是拿不到评价的,因为评价是在页面加载之后才动态载入的
点开这个网页,就是我想要的评价了。
(3)代码分析
# Fetch the content of the page at a given URL.
def AskURL(url):
    """Return the UTF-8 decoded body of ``url``, or ``""`` if the request fails.

    The request is sent with a mobile-browser ``user-agent`` header so the
    server answers as it would to a regular browser.

    On ``urllib.error.URLError`` (which includes ``HTTPError``) the error's
    ``code`` and/or ``reason`` are printed, when present, and the empty
    string is returned. A body that is not valid UTF-8 raises
    ``UnicodeDecodeError``; that exception is not handled here.
    """
    # Pretend to be a browser when sending the request to Ctrip.
    head = {"user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Mobile Safari/537.36 Edg/90.0.818.62"}
    # The user agent tells the server what kind of client is asking, which
    # determines what content it is willing to return.
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even when read()/decode()
        # raises, so the underlying connection is never leaked.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries both attributes; a plain URLError only has `reason`.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
这个函数用来获取网页:它返回网页的 HTML 源码,接下来就要用正则表达式从中提取需要的字段了
# Compiled patterns for the fields of a hotel review, applied to HTML text.

# User name.
findTitle = re.compile(r'<span>(.*?)</span>')
# Member level. The class attribute must be closed with a double quote before
# '>' or the pattern cannot match markup such as
# <span class="user-level-base user-level-1">...</span>.
findLevel = re.compile(r'<span class="user-level-base user-level-1">(.*?)</span>', re.S)
# Check-in time.
findTime = re.compile(r'<span class="checkin-line">(.*?)</span>')
# Stay type.
findType = re.compile(r'<em class="item hotel-border">(.*?)</em>')
# Room stayed in.
# NOTE(review): the pattern starts with a literal space, so it only matches
# when the tag is preceded by a space - confirm this is intended.
findHotel = re.compile(r' <span class="hotel-arr" data-ubt-key="c_hotel_comment_baseroom" style="color:#4289ff;">(.*?)</span>')
# Review text; re.S lets the captured text span several lines.
findAssess = re.compile(r'<p class="tree-ellips-line6 comment-swarp">(.*?)</p>', re.S)
# Reply to the review.
# NOTE(review): this pattern also starts with a literal space - confirm.
findResponse = re.compile(r' <li style="display:none;">(.*?)</li>', re.S)
从网页中提取到的数据,我保存在 Excel 表格里。

(4)由于携程是动态载入,所以一次只能爬取10条。关于动态爬取我放在下一篇。