爬取酒店的评论
使用的库
import urllib2
import requests
import re
import time
import json
- 通过抓包发现,酒店的id由asyncsearch接口返回,该接口是POST请求
- 设置POST参数和请求头(headers)的值
# Request headers replayed from a browser capture of the asyncsearch call.
# The Cookie value is a captured session; its ShHotel part names Beijing, so
# presumably it selects the city being listed -- verify against the site.
# Content-Length is deliberately not set here: the captured value (1062)
# described the browser's form body, not the two-field body sent below, and
# requests computes the correct length from the encoded body itself.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Connection': 'keep-alive',
    'Cookie': 'CookieGuid=158948cd-563a-4211-8a3e-a92bd263cba0; page_time=1509628060819%2C1509628152402%2C1509628219681%2C1509693907739%2C1509693938986%2C1509693967862%2C1509701358453%2C1509701412108%2C1509764088450%2C1509764288344%2C1509772980043%2C1509773457586%2C1509773753700%2C1509774349211%2C1509774650832%2C1509775873404%2C1509775901516%2C1509776293716%2C1509776386781%2C1509776424331%2C1509776587625%2C1509776889277%2C1509778200633%2C1509779461446; _RF1=111.225.131.187; _RSG=zs4ixjEH93BfifEVp1.6NB; _RDG=28df89f13e8f75279825ba18abc6dd550e; _RGUID=b0c944eb-7bb0-4dbd-b5c6-f81cdf3a9b29; ShHotel=CityID=0101&CityNameCN=%E5%8C%97%E4%BA%AC%E5%B8%82&CityName=%E5%8C%97%E4%BA%AC%E5%B8%82&OutDate=2017-11-06&CityNameEN=beijing&InDate=2017-11-05; _fid=j9jp7iwv-bf9b-4c1d-8c4f-893b780f205c; newjava1=a79d58d364ea53c8c9161ec3b2f45c8d; JSESSIONID=EB741B472421B3E455397D577D009DE8; SessionGuid=6a99e379-6ae4-4a74-b0a4-f94dfe60ab5a; Esid=1ec4caeb-c805-4d96-8ddf-43a6a23eed52; com.eLong.CommonService.OrderFromCookieInfo=Status=1&Orderfromtype=1&Isusefparam=0&Pkid=50&Parentid=50000&Coefficient=0.0&Makecomefrom=0&Cookiesdays=0&Savecookies=0&Priority=8000; fv=pcweb; s_cc=true; s_sq=%5B%5BB%5D%5D; s_visit=1',
    'Host': 'hotel.elong.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
}
# Form fields of the POST. `k` is the result page to fetch and `url` the
# asyncsearch address; both must be defined by the surrounding code.
data = {
    'listRequest.pageIndex': k,
    'listRequest.areaID': '',
}
# requests waits forever by default; the timeout turns a stalled connection
# into an exception instead of a hung script.
r = requests.post(url, data=data, headers=headers, timeout=30)
- 利用json提出酒店的id:
# Decode the JSON body of the asyncsearch response `r`.
a=json.loads(r.text)
# The ids sit under value -> hotelIds; presumably a single comma-separated
# string (the complete script in this file splits it on ',').
b=a['value']['hotelIds']
- 每个酒店的url就是:http://www.elong.com/handan/ + 酒店的id,这样就获得了每个酒店的url(下文代码中实际请求的是 http://hotel.elong.com/ + 酒店的id + /)
- 抓包发现,酒店评论由下面这个接口返回:
http://www.elong.com/ajax/detail/gethotelreviews/?hotelId=90060790&recommendedType=0&pageIndex=1&mainTagId=0&subTagId=0&code=7052955&elongToken=j9ihkh4h-94d4-4e39-9bda-8b96cd4fcf7f&ctripToken=b0c944eb-7bb0-4dbd-b5c6-f81cdf3a9b29&_=1513509733755
该接口返回的是json,用json解析后即可获得评论
def getpinglun(href):
    """Fetch the reviews of one hotel and append them to the open output file.

    Requests the ``gethotelreviews`` ajax endpoint page by page (pageIndex
    1 to 19). For every entry in the ``contents`` list of the JSON reply the
    ``createTimeString`` and ``content`` fields are printed and written to
    the output file, one per line, encoded as UTF-8. Paging stops at the
    first page whose ``contents`` is empty.

    The ``code``, ``elongToken``, ``ctripToken`` and ``_`` query values are
    fixed strings; presumably they were copied from a captured browser
    request and may expire -- verify before relying on them.

    Args:
        href: hotel id; it is passed through ``str()`` and used as the
            ``hotelId`` query parameter.

    Output goes to the module-level file object ``f``, which the caller
    must have opened before calling this function.
    """
    for i in range(1, 20):
        url = 'http://hotel.elong.com/ajax/detail/gethotelreviews/?hotelId='+str(href)+'&recommendedType=0&pageIndex='+str(i)+'&mainTagId=0&subTagId=0&code=7912561&elongToken=j9cq7p2u-5f25-4d49-83e6-cb143cc52d2b&ctripToken=85cd1013-f8a8-4536-8d88-20dab6e854b6&_=1509280969224'
        print(url)
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            html = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()
        b = json.loads(html)
        if not b['contents']:
            # No reviews on this page: presumably the hotel has no further
            # pages, so do not keep requesting the remaining page numbers.
            break
        for k in b['contents']:
            print(k["createTimeString"].encode('utf-8'))
            f.write(k["createTimeString"].encode('utf-8')+'\n')
            print(k["content"].encode('utf-8'))
            f.write(k["content"].encode('utf-8')+'\n')
        # Short pause between page requests.
        time.sleep(0.1)
- 利用bs4获得酒店信息
def gethotel(href):
    """Download one hotel page and append its description text to the output file.

    Fetches ``http://hotel.elong.com/<href>/``, parses the HTML and, for
    every ``<div class="dview_info">`` element, prints its text and writes
    it to the output file encoded as UTF-8.

    Args:
        href: hotel id; it is passed through ``str()`` and used as the
            path segment of the hotel page URL.

    Output goes to the module-level file object ``f``, which the caller
    must have opened before calling this function.
    """
    url = 'http://hotel.elong.com'+'/'+str(href)+'/'
    print(url)
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        html = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed, so results can differ between machines
    # and a warning is emitted on every call.
    soup = BeautifulSoup(html, 'html.parser')
    for k in soup.find_all('div', class_='dview_info'):
        a = k.get_text()
        print(a)
        f.write(a.encode('utf-8'))
整体代码实现:
#--*--coding:utf-8--*--
import urllib2
import requests
import re
import time
import json
from bs4 import BeautifulSoup
import sys
# Python 2 only. sys.setdefaultencoding is not reachable on a freshly imported
# sys module, so the module is reloaded first to get it back.
reload(sys)
# Make UTF-8 the process-wide default codec for implicit str/unicode
# conversions; the print and f.write calls on scraped text below may rely on it.
sys.setdefaultencoding('utf8')
def gethotel(href):
    """Download one hotel page and append its description text to the output file.

    Fetches ``http://hotel.elong.com/<href>/``, parses the HTML and, for
    every ``<div class="dview_info">`` element, prints its text and writes
    it to the output file encoded as UTF-8.

    Args:
        href: hotel id; it is passed through ``str()`` and used as the
            path segment of the hotel page URL.

    Output goes to the module-level file object ``f``, which the caller
    must have opened before calling this function.
    """
    url = 'http://hotel.elong.com'+'/'+str(href)+'/'
    print(url)
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        html = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed, so results can differ between machines
    # and a warning is emitted on every call.
    soup = BeautifulSoup(html, 'html.parser')
    for k in soup.find_all('div', class_='dview_info'):
        a = k.get_text()
        print(a)
        f.write(a.encode('utf-8'))
def getpinglun(href):
    """Fetch the reviews of one hotel and append them to the open output file.

    Requests the ``gethotelreviews`` ajax endpoint page by page (pageIndex
    1 to 19). For every entry in the ``contents`` list of the JSON reply the
    ``createTimeString`` and ``content`` fields are printed and written to
    the output file, one per line, encoded as UTF-8. Paging stops at the
    first page whose ``contents`` is empty.

    The ``code``, ``elongToken``, ``ctripToken`` and ``_`` query values are
    fixed strings; presumably they were copied from a captured browser
    request and may expire -- verify before relying on them.

    Args:
        href: hotel id; it is passed through ``str()`` and used as the
            ``hotelId`` query parameter.

    Output goes to the module-level file object ``f``, which the caller
    must have opened before calling this function.
    """
    for i in range(1, 20):
        url = 'http://hotel.elong.com/ajax/detail/gethotelreviews/?hotelId='+str(href)+'&recommendedType=0&pageIndex='+str(i)+'&mainTagId=0&subTagId=0&code=7912561&elongToken=j9cq7p2u-5f25-4d49-83e6-cb143cc52d2b&ctripToken=85cd1013-f8a8-4536-8d88-20dab6e854b6&_=1509280969224'
        print(url)
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            html = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()
        b = json.loads(html)
        if not b['contents']:
            # No reviews on this page: presumably the hotel has no further
            # pages, so do not keep requesting the remaining page numbers.
            break
        for k in b['contents']:
            print(k["createTimeString"].encode('utf-8'))
            f.write(k["createTimeString"].encode('utf-8')+'\n')
            print(k["content"].encode('utf-8'))
            f.write(k["content"].encode('utf-8')+'\n')
        # Short pause between page requests.
        time.sleep(0.1)
def geturl():
    """Collect hotel ids from the elong ``asyncsearch`` listing endpoint.

    POSTs the listing form for page indexes 1 to 19 and reads the
    ``value -> hotelIds`` field of each JSON reply, which is split on
    commas.

    Returns:
        A list with one entry per requested page; each entry is the list of
        hotel id strings found on that page (empty when the page has none).

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
    """
    url = 'http://hotel.elong.com/ajax/list/asyncsearch'
    # Headers replayed from a browser capture. The Cookie value is a captured
    # session; its ShHotel part names Beijing, so presumably it selects the
    # city being listed -- verify against the site.
    # Content-Length is deliberately not set: the captured value (1062)
    # described the browser's form body, not the two-field body sent here,
    # and requests computes the correct length itself.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Cookie': 'CookieGuid=158948cd-563a-4211-8a3e-a92bd263cba0; page_time=1509628060819%2C1509628152402%2C1509628219681%2C1509693907739%2C1509693938986%2C1509693967862%2C1509701358453%2C1509701412108%2C1509764088450%2C1509764288344%2C1509772980043%2C1509773457586%2C1509773753700%2C1509774349211%2C1509774650832%2C1509775873404%2C1509775901516%2C1509776293716%2C1509776386781%2C1509776424331%2C1509776587625%2C1509776889277%2C1509778200633%2C1509779461446; _RF1=111.225.131.187; _RSG=zs4ixjEH93BfifEVp1.6NB; _RDG=28df89f13e8f75279825ba18abc6dd550e; _RGUID=b0c944eb-7bb0-4dbd-b5c6-f81cdf3a9b29; ShHotel=CityID=0101&CityNameCN=%E5%8C%97%E4%BA%AC%E5%B8%82&CityName=%E5%8C%97%E4%BA%AC%E5%B8%82&OutDate=2017-11-06&CityNameEN=beijing&InDate=2017-11-05; _fid=j9jp7iwv-bf9b-4c1d-8c4f-893b780f205c; newjava1=a79d58d364ea53c8c9161ec3b2f45c8d; JSESSIONID=EB741B472421B3E455397D577D009DE8; SessionGuid=6a99e379-6ae4-4a74-b0a4-f94dfe60ab5a; Esid=1ec4caeb-c805-4d96-8ddf-43a6a23eed52; com.eLong.CommonService.OrderFromCookieInfo=Status=1&Orderfromtype=1&Isusefparam=0&Pkid=50&Parentid=50000&Coefficient=0.0&Makecomefrom=0&Cookiesdays=0&Savecookies=0&Priority=8000; fv=pcweb; s_cc=true; s_sq=%5B%5BB%5D%5D; s_visit=1',
        'Host': 'hotel.elong.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
    }
    a2 = []
    for k in range(1, 20):
        data = {
            'listRequest.pageIndex': k,
            'listRequest.areaID': '',
        }
        # requests waits forever by default; bound the wait so one stalled
        # page cannot hang the whole crawl.
        r = requests.post(url, data=data, headers=headers, timeout=30)
        # Fail here on an HTTP error instead of on a confusing JSON/KeyError
        # further down.
        r.raise_for_status()
        a = json.loads(r.text)
        b = a['value']['hotelIds']
        # ''.split(',') yields [''], which would later be requested as a
        # hotel with an empty id; keep only non-empty ids.
        a2.append([hotel_id for hotel_id in b.split(',') if hotel_id])
    return a2
import os

# geturl() returns one list of hotel ids per listing page.
url = geturl()
# Output files are written to the directory "01"; create it on first run so
# open() does not fail on a missing directory.
if not os.path.isdir('01'):
    os.makedirs('01')
k = 1
for href in url:
    for a in href:
        # gethotel() and getpinglun() write through the module-level name
        # ``f``, so the handle has to stay bound to exactly that global.
        # os.path.join replaces the literal '01\%d.txt', which relied on an
        # invalid escape sequence and a Windows-only path separator.
        f = open(os.path.join('01', '%d.txt' % k), 'a')
        try:
            gethotel(a)
            getpinglun(a)
        finally:
            # Close the file even if a request or the parsing fails.
            f.close()
        time.sleep(0.1)
        # One numbered output file per hotel.
        k = k+1