post请求爬取艺龙酒店的评论

爬取酒店的评论

使用的库

import urllib2
import requests
import re
import time
import json

  • 通过抓包发现酒店的id在asyncsearch里
网址:http://hotel.elong.com/ajax/list/asyncsearch

这是一个POST请求

  • 设置POST参数和请求头(headers)的值

# Request headers copied from a browser capture of the asyncsearch call.
# Content-Length is deliberately not set: the captured value (1062) does not
# match this much smaller form body, and requests fills in the correct length
# itself when it encodes `data`.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Connection': 'keep-alive',
    'Cookie': 'CookieGuid=158948cd-563a-4211-8a3e-a92bd263cba0; page_time=1509628060819%2C1509628152402%2C1509628219681%2C1509693907739%2C1509693938986%2C1509693967862%2C1509701358453%2C1509701412108%2C1509764088450%2C1509764288344%2C1509772980043%2C1509773457586%2C1509773753700%2C1509774349211%2C1509774650832%2C1509775873404%2C1509775901516%2C1509776293716%2C1509776386781%2C1509776424331%2C1509776587625%2C1509776889277%2C1509778200633%2C1509779461446; _RF1=111.225.131.187; _RSG=zs4ixjEH93BfifEVp1.6NB; _RDG=28df89f13e8f75279825ba18abc6dd550e; _RGUID=b0c944eb-7bb0-4dbd-b5c6-f81cdf3a9b29; ShHotel=CityID=0101&CityNameCN=%E5%8C%97%E4%BA%AC%E5%B8%82&CityName=%E5%8C%97%E4%BA%AC%E5%B8%82&OutDate=2017-11-06&CityNameEN=beijing&InDate=2017-11-05; _fid=j9jp7iwv-bf9b-4c1d-8c4f-893b780f205c; newjava1=a79d58d364ea53c8c9161ec3b2f45c8d; JSESSIONID=EB741B472421B3E455397D577D009DE8; SessionGuid=6a99e379-6ae4-4a74-b0a4-f94dfe60ab5a; Esid=1ec4caeb-c805-4d96-8ddf-43a6a23eed52; com.eLong.CommonService.OrderFromCookieInfo=Status=1&Orderfromtype=1&Isusefparam=0&Pkid=50&Parentid=50000&Coefficient=0.0&Makecomefrom=0&Cookiesdays=0&Savecookies=0&Priority=8000; fv=pcweb; s_cc=true; s_sq=%5B%5BB%5D%5D; s_visit=1',
    'Host': 'hotel.elong.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
}
# Form fields of the POST. `k` is the result-page number and `url` the
# asyncsearch endpoint; both are supplied by the surrounding code (the full
# script loops k over range(1, 20)).
data = {
    'listRequest.pageIndex': k,
    'listRequest.areaID': '',
}
r = requests.post(url, data=data, headers=headers)

  • 利用json提取酒店的id:

 # Decode the JSON reply of the asyncsearch POST and pull out the hotel ids.
 # The full script later calls .split(',') on this value, so it is presumably
 # a comma-separated string of ids -- confirm against a live response.
 a=json.loads(r.text)
 b=a['value']['hotelIds']

  •  每个酒店的url就是:http://hotel.elong.com/ + 酒店的id + /,这样就获得了每个酒店的url
  •  抓包发现酒店评论在

http://www.elong.com/ajax/detail/gethotelreviews/?hotelId=90060790&recommendedType=0&pageIndex=1&mainTagId=0&subTagId=0&code=7052955&elongToken=j9ihkh4h-94d4-4e39-9bda-8b96cd4fcf7f&ctripToken=b0c944eb-7bb0-4dbd-b5c6-f81cdf3a9b29&_=1513509733755

利用json获得评论

def getpinglun(href):
    """Download the reviews of one hotel and append them to the output file.

    Requests review pages 1..19 from the ``gethotelreviews`` AJAX endpoint
    and, for every review, prints and writes its ``createTimeString`` and
    ``content`` fields (UTF-8 encoded, one per line) to the module-level
    file object ``f``, which the caller must have opened beforehand.
    Paging stops at the first page that carries no reviews.

    :param href: hotel id, inserted into the ``hotelId`` query parameter.
    """
    for page in range(1, 20):
        url = 'http://hotel.elong.com/ajax/detail/gethotelreviews/?hotelId=' + str(href) + '&recommendedType=0&pageIndex=' + str(page) + '&mainTagId=0&subTagId=0&code=7912561&elongToken=j9cq7p2u-5f25-4d49-83e6-cb143cc52d2b&ctripToken=85cd1013-f8a8-4536-8d88-20dab6e854b6&_=1509280969224'
        print(url)

        response = urllib2.urlopen(urllib2.Request(url))
        try:
            html = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()

        reviews = json.loads(html)['contents']
        if not reviews:
            # An empty page means the hotel has no further reviews, so the
            # remaining page requests would only waste round trips.
            break

        for review in reviews:
            created = review["createTimeString"].encode('utf-8')
            content = review["content"].encode('utf-8')
            print(created)
            f.write(created + '\n')
            print(content)
            f.write(content + '\n')
            time.sleep(0.1)

  • 利用bs4获得酒店信息

def gethotel(href):
    """Fetch a hotel's detail page and append its summary text to the output file.

    Downloads ``http://hotel.elong.com/[href]/``, collects every ``div``
    element with class ``dview_info``, prints its text and writes the UTF-8
    encoded text to the module-level file object ``f``, which the caller
    must have opened beforehand.

    :param href: hotel id (one entry of the asyncsearch ``hotelIds`` value).
    """
    url = 'http://hotel.elong.com' + '/' + str(href) + '/'
    print(url)
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed (and warns), so output can vary by machine.
    soup = BeautifulSoup(html, 'html.parser')
    for block in soup.find_all('div', class_='dview_info'):
        text = block.get_text()
        print(text)
        f.write(text.encode('utf-8'))

整体代码实现:

#--*--coding:utf-8--*--
import urllib2
import requests
import re
import time
import json
from bs4 import BeautifulSoup
import sys
# Python 2-only workaround: sys.setdefaultencoding is removed from the sys
# namespace at interpreter start-up, so the module is reloaded to get it back.
# Setting the default to UTF-8 lets the implicit str/unicode conversions in
# this script handle Chinese text instead of failing with the ASCII codec.
reload(sys)
sys.setdefaultencoding('utf8')
def gethotel(href):
    """Fetch a hotel's detail page and append its summary text to the output file.

    Downloads ``http://hotel.elong.com/[href]/``, collects every ``div``
    element with class ``dview_info``, prints its text and writes the UTF-8
    encoded text to the module-level file object ``f``, which the caller
    must have opened beforehand.

    :param href: hotel id (one entry of the asyncsearch ``hotelIds`` value).
    """
    url = 'http://hotel.elong.com' + '/' + str(href) + '/'
    print(url)
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed (and warns), so output can vary by machine.
    soup = BeautifulSoup(html, 'html.parser')
    for block in soup.find_all('div', class_='dview_info'):
        text = block.get_text()
        print(text)
        f.write(text.encode('utf-8'))
def getpinglun(href):
    """Download the reviews of one hotel and append them to the output file.

    Requests review pages 1..19 from the ``gethotelreviews`` AJAX endpoint
    and, for every review, prints and writes its ``createTimeString`` and
    ``content`` fields (UTF-8 encoded, one per line) to the module-level
    file object ``f``, which the caller must have opened beforehand.
    Paging stops at the first page that carries no reviews.

    :param href: hotel id, inserted into the ``hotelId`` query parameter.
    """
    for page in range(1, 20):
        url = 'http://hotel.elong.com/ajax/detail/gethotelreviews/?hotelId=' + str(href) + '&recommendedType=0&pageIndex=' + str(page) + '&mainTagId=0&subTagId=0&code=7912561&elongToken=j9cq7p2u-5f25-4d49-83e6-cb143cc52d2b&ctripToken=85cd1013-f8a8-4536-8d88-20dab6e854b6&_=1509280969224'
        print(url)

        response = urllib2.urlopen(urllib2.Request(url))
        try:
            html = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()

        reviews = json.loads(html)['contents']
        if not reviews:
            # An empty page means the hotel has no further reviews, so the
            # remaining page requests would only waste round trips.
            break

        for review in reviews:
            created = review["createTimeString"].encode('utf-8')
            content = review["content"].encode('utf-8')
            print(created)
            f.write(created + '\n')
            print(content)
            f.write(content + '\n')
            time.sleep(0.1)
def geturl():
    """Collect hotel ids from the first 19 pages of the eLong listing API.

    POSTs ``listRequest.pageIndex`` = 1..19 to the ``asyncsearch`` endpoint
    and reads the comma-separated ``value.hotelIds`` string of each reply.

    :return: a list with one entry per requested page; each entry is the
        list of hotel id strings found on that page (empty ids are dropped).
    """
    url = 'http://hotel.elong.com/ajax/list/asyncsearch'
    # Headers copied from a browser capture. Content-Length is deliberately
    # not set: the captured value (1062) does not match this much smaller
    # form body, and requests fills in the correct length itself.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Cookie': 'CookieGuid=158948cd-563a-4211-8a3e-a92bd263cba0; page_time=1509628060819%2C1509628152402%2C1509628219681%2C1509693907739%2C1509693938986%2C1509693967862%2C1509701358453%2C1509701412108%2C1509764088450%2C1509764288344%2C1509772980043%2C1509773457586%2C1509773753700%2C1509774349211%2C1509774650832%2C1509775873404%2C1509775901516%2C1509776293716%2C1509776386781%2C1509776424331%2C1509776587625%2C1509776889277%2C1509778200633%2C1509779461446; _RF1=111.225.131.187; _RSG=zs4ixjEH93BfifEVp1.6NB; _RDG=28df89f13e8f75279825ba18abc6dd550e; _RGUID=b0c944eb-7bb0-4dbd-b5c6-f81cdf3a9b29; ShHotel=CityID=0101&CityNameCN=%E5%8C%97%E4%BA%AC%E5%B8%82&CityName=%E5%8C%97%E4%BA%AC%E5%B8%82&OutDate=2017-11-06&CityNameEN=beijing&InDate=2017-11-05; _fid=j9jp7iwv-bf9b-4c1d-8c4f-893b780f205c; newjava1=a79d58d364ea53c8c9161ec3b2f45c8d; JSESSIONID=EB741B472421B3E455397D577D009DE8; SessionGuid=6a99e379-6ae4-4a74-b0a4-f94dfe60ab5a; Esid=1ec4caeb-c805-4d96-8ddf-43a6a23eed52; com.eLong.CommonService.OrderFromCookieInfo=Status=1&Orderfromtype=1&Isusefparam=0&Pkid=50&Parentid=50000&Coefficient=0.0&Makecomefrom=0&Cookiesdays=0&Savecookies=0&Priority=8000; fv=pcweb; s_cc=true; s_sq=%5B%5BB%5D%5D; s_visit=1',
        'Host': 'hotel.elong.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
    }
    pages = []
    for page_index in range(1, 20):
        data = {
            'listRequest.pageIndex': page_index,
            'listRequest.areaID': '',
        }
        r = requests.post(url, data=data, headers=headers)
        id_string = json.loads(r.text)['value']['hotelIds']
        # ''.split(',') yields [''], which would make the caller crawl a
        # hotel with an empty id; keep only real ids.
        pages.append([hotel_id for hotel_id in id_string.split(',') if hotel_id])
    return pages
# Driver: crawl every hotel id returned by the listing API and store each
# hotel's details and reviews in its own numbered text file.
url=geturl()
k=1

for href in url:
    for a in href:
        # `f` has to be a module-level name because gethotel() and
        # getpinglun() write to it as a global. The `with` block closes the
        # file even when one of the crawl calls raises.
        # '01\\%d.txt' is the same string the script always used: on Windows
        # it names <k>.txt inside directory 01 (which must already exist);
        # on POSIX systems the backslash is simply part of the file name.
        with open('01\\%d.txt'%k,'a') as f:
            gethotel(a)
            getpinglun(a)
        time.sleep(0.1)
        k=k+1


  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值