途牛(酒店)

#一、采集详细页面(GET 请求 + 多线程):酒店名、详细页面 URL、地址、星级/等级、装修年限、综合评分、四项评分

  • 注意:判断酒店所属区域(主城九区)或景区时,不要用经纬度。由于不知道每个区域边界的经纬度,按距离计算并不准确;而且渝中区面积太小,酒店和景点又很集中。
  • 采集图中的 1 和 2:1 是具体的位置描述,用分词确定位置;2 是途牛网站划分的区域。结合 1 和 2 得到最终位置。

import jwdsj
import json
import threading
from scrapy import Selector
from pymongo import MongoClient
from requests import get
import time
# Browser-like User-Agent header passed to the HTTP requests below.
HEADERS = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
# MongoDB client; the host address is hard-coded.
MC = MongoClient("192.168.210.45")
# Target collection -- change the collection name here for a new run.
table = MC.shuju_tuniu_jd.copy_tuniu
m = 1
# Page numbers 1..1406 of the hotel list; adjust both literals below when the
# number of pages changes.
ii = list(range(1, 1407))
# Chunk size: one eighth of the page count, rounded down.
fenshu = 1406 // 8
# Seven chunks of `fenshu` pages each, then a final chunk that also takes
# whatever is left over from the rounding.
lists = [ii[start:start + fenshu] for start in range(0, fenshu * 7, fenshu)]
lists.append(ii[fenshu * 7:])
def _hotel_record(hh1):
    """Build the MongoDB document for one hotel entry of the list response."""
    level = hh1['levelInfo']
    title = level['title']
    # Normalise where the grade/star rating comes from, based on the first
    # characters of the title text.
    if title[:4] == '途牛用户':
        gradesource = '途牛'
    elif title[:4] == '国家旅游':
        gradesource = title[:5]
    else:
        gradesource = title
    pos = hh1['pos']
    decorate_year = hh1['decorateYear']
    a = {
        'flag': 0,
        'name': hh1['name'],                            # hotel name
        'url': 'http://hotel.tuniu.com' + hh1['url'],   # detail page URL
        'address': hh1['address'],
        'l_lng': pos['lng'],                            # longitude
        'l_lat': pos['lat'],                            # latitude
        'gradename': level['name'],                     # grade / star rating
        'gradesource': gradesource,
        # An empty decoration year is stored as the placeholder '无'.
        'decorateYear': decorate_year if decorate_year != '' else '无',
        'startPrice': hh1['startPrice'],                # starting price
        'pl_remarkScore': hh1['remarkScore'],           # overall score
    }
    # One field per rating aspect, keyed by the aspect's own name.
    for aspect in hh1['remarkAspect']:
        a[aspect['name']] = aspect['score']
    a['commentnumber'] = hh1['remarkCount']             # number of comments
    return a


def xianc(list=None):
    """Scrape the given hotel-list pages and store every hotel in MongoDB.

    Args:
        list: page numbers to fetch. The parameter keeps its original name
            (which shadows the builtin) so existing callers keep working;
            ``None`` means "no pages".

    Each page is requested from the Tuniu list endpoint, every hotel in the
    response is converted with ``_hotel_record`` and inserted into ``table``.
    Request and parsing errors are not caught here and end the calling thread.
    """
    pages = [] if list is None else list[:]
    url_template = 'http://hotel.tuniu.com/ajax/list?search%5Bcity%5D=300&search%5BcheckInDate%5D=2019-8-30&search%5BcheckOutDate%5D=2019-8-31&search%5BcityCode%5D=300&page={}'
    m = 0  # pages finished by this worker
    for url_up in pages:
        # A timeout keeps a stalled connection from blocking the worker forever.
        html = get(url=url_template.format(url_up), headers=HEADERS, timeout=30).json()
        for n, hh1 in enumerate(html["data"]["list"], 1):
            table.insert_one(_hotel_record(hh1))  # store in MongoDB
            print('进度:{}/19 {}/1406'.format(n, m))
            time.sleep(3)  # throttle between hotels
        m += 1
# One worker thread per chunk of page numbers.
threads = [threading.Thread(target=xianc, args=(chunk,)) for chunk in lists]
for t in threads:
    t.start()
# Wait for every worker before closing the client: the workers insert through
# the shared MongoDB connection, so closing it right after start() would take
# it away while they are still running.
for t in threads:
    t.join()
MC.close()

#二、采集价格(POST 请求)。这里是对所有房型的价格取平均值;其实还可以结合房间面积、可入住人数、床型等因素进行计算。

from pymongo import MongoClient
import time
import json
import requests
MC = MongoClient("192.168.210.45")
# Target collection -- change the collection name here for a new run.
table = MC.shuju_tuniu_jd.tuniu0828
# Hotels selected for price collection carry flag 4.
data = table.find({'flag': 4})

# Stored when a hotel has no usable price information.
_NO_PRICE = {'flag': 1, 'max': "无", 'min': "无", 'average': "无"}


def _price_summary(rooms):
    """Return the flag/max/min/average update for `rooms`, or None without prices.

    Every rate plan of every room contributes its average sale price; the
    average is taken over all rate plans, not per room.
    """
    prices = [
        plan['avgSalePrice']['price']
        for room in rooms
        for plan in room['ratePlans']
    ]
    if not prices:
        # Rooms without any rate plan would otherwise divide by zero.
        return None
    return {
        'flag': 1,
        'max': max(prices),
        'min': min(prices),
        'average': sum(prices) / len(prices),
    }


postUrl = 'http://hotel.tuniu.com/hotel-api/hotel/rateplan?c=%7B%22ct%22%3A20000%7D'
# Request headers.
payloadHeader = {
    'Host': 'hotel.tuniu.com',
    'Content-Type': 'application/json',
}
p = 0  # number of hotels stored with real prices
for k in data:
    url_up = k['url']
    # The hotel id is whatever follows the last '/' of the detail URL.
    url_0 = url_up[url_up.rfind('/') + 1:]
    payloadData = {
        'checkIn': "2019-09-03",
        'checkOut': "2019-09-04",
        'roomNum': 1,
        'adultNum': 2,
        'childAges': [],
        'childNum': 0,
        'filters': [],
        'secondaryDist': {
            'pValue': "",
            'userType': 0
        },
        'hotelId': "{}".format(url_0),
    }
    # A timeout keeps one stalled request from blocking the whole run.
    r = requests.post(postUrl, data=json.dumps(payloadData), headers=payloadHeader, timeout=30).json()
    # Error responses such as
    # {'success': False, 'errorCode': 7081002, 'msg': '价格计划查询异常'}
    # have no 'data' key; they are treated like a hotel without rooms.
    rooms = (r.get('data') or {}).get('rooms') or []
    summary = _price_summary(rooms)
    if summary is None:
        table.update_one({"url": url_up}, {'$set': _NO_PRICE})
    else:
        table.update_one({"url": url_up}, {'$set': summary})
        p += 1
        print(p)

MC.close()

#三、途牛使用的经纬度来自百度地图(跳过)

  • 景点的经纬度可以在列表页面得到,但途牛仅列举了部分景点,且渝中区的部分景点(洪崖洞、解放碑)离得太近,计算景点周边酒店时会有问题。
import jwdsj
import json
from urllib.request import Request, urlopen
from pymongo import MongoClient
import time
# Browser-like User-Agent header for the filter-list request.
HEADERS = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
MC = MongoClient("192.168.210.45")
# Target collection -- change the collection name here for a new run.
table = MC.shuju_tuniu_jd.tuniu0827
m = 1
URL = 'http://hotel.tuniu.com/ajax/filterList'
# Fetch the filter list; the response body is decoded as GBK before parsing.
raw = urlopen(Request(url=URL, headers=HEADERS)).read()
filter_groups = json.loads(raw.decode('GBK'))["data"]["filterList"][1]['pros']
# Print one record per option: its location category, name and coordinates.
for group in filter_groups:
    category = group['name']  # location category
    options = group['pros']
    for option in options:
        a = {
            '位置分类': category,
            'xuanxiang': option['name'],
            'lng': option['lng'],
            'lat': option['lat'],
        }
        print(a)

#四、采集评论(GET 请求;这里仅涉及“一般”和“待改善”两类评论)

import useragent
from requests import get
import requests
import threading
from scrapy import Selector
from pymongo import MongoClient
import time
MC = MongoClient("192.168.210.45")
# Target collection -- change the collection name here for a new run.
table = MC.shuju_tuniu_jd.tuniu0828
# Hotels with flag 4 whose general-comment text is still empty.
data = table.find({"flag": 4, 'general_wb': ''}, {"_id": 0, "address": 0})
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
m = 0
# Keep only the fields the workers need for each hotel.
ii1 = [
    {
        'generalCount': k['generalCount'],
        'negativeCount': k['negativeCount'],
        'url': k["url"],
    }
    for k in data
]
# All hotels are handled in this run; slice ii1 here instead to work on just
# a part of them.
ii = ii1
print(len(ii))
# Chunk size: one eighth of the job count, rounded down.
fenshu = len(ii) // 8
# Seven chunks of `fenshu` jobs each, then a final chunk with the remainder.
lists = [ii[n * fenshu:(n + 1) * fenshu] for n in range(7)]
lists.append(ii[fenshu * 7:])
def _fetch_remarks(hotel_id, group, page_count):
    """Download `page_count` pages of one comment group for a hotel.

    Returns the comments as 'text|stay date' strings in page order.
    """
    remarks = []
    for page in range(1, page_count + 1):
        url = "http://hotel.tuniu.com/ajax/remarkQuery?hotelId={}&p={}&group={}".format(hotel_id, page, group)
        # A timeout keeps a stalled connection from blocking the worker forever.
        page_html = get(url=url, headers=HEADERS, timeout=30).text
        selector = Selector(text=page_html)
        texts = selector.xpath('//div[@class="u5 clearfix"]/div[@class="a2"]/div[@class="b2"]/p/text()').extract()
        dates = selector.xpath('//div[@class="u5 clearfix"]/div[@class="a2"]/div[@class ="b4"]/span/text()').extract()
        for idx, text in enumerate(texts):
            # Indexed on purpose: a comment without a matching date raises
            # IndexError instead of being silently paired with the wrong one.
            remarks.append('%s|%s' % (text.strip(), dates[idx]))
    return remarks


def xianc(list=None):
    """Collect the "general" and "negative" comments for the given hotels.

    Args:
        list: dicts with the keys 'url', 'generalCount' and 'negativeCount'.
            The parameter keeps its original name (which shadows the builtin)
            so existing callers keep working; ``None`` means "no hotels".

    For each hotel the comment pages of group 2 (general) and group 3
    (negative) are downloaded, 19 comments per page, joined into one string of
    '(text|date)' items and written to the hotel's document in ``table``.
    A hotel with a zero count gets the placeholder '无' for that group.
    Any error while handling a hotel is printed and the hotel is skipped.
    """
    jobs = [] if list is None else list[:]
    m = 0  # hotels processed
    q = 0  # hotels that failed
    for kks in jobs:
        try:
            url_up = kks['url']
            # The hotel id is whatever follows the last '/' of the detail URL,
            # e.g. 'http://hotel.tuniu.com/detail/1567448166' -> '1567448166'.
            url_0 = url_up[url_up.rfind('/') + 1:]
            # Number of pages per group: ceiling of count / 19.
            pages_general = (int(kks['generalCount']) + 18) // 19
            pages_negative = (int(kks['negativeCount']) + 18) // 19
            time.sleep(0.2)
            b = {'flag': 4}
            if pages_general > 0:
                general = _fetch_remarks(url_0, 2, pages_general)
                b['general_wb'] = "".join("({})".format(item) for item in general)
            else:
                b['general_wb'] = '无'
            if pages_negative > 0:
                negative = _fetch_remarks(url_0, 3, pages_negative)
                b['negative_wb'] = "".join("({})".format(item) for item in negative)
            else:
                b['negative_wb'] = '无'
            time.sleep(0.2)
            if b['general_wb'] != "":
                table.update_one({"url": url_up}, {'$set': b})
            else:
                # Comments were expected but none could be parsed; the
                # document is left unchanged.
                print('cw')
            m += 1
            print('{}'.format(m))
            time.sleep(0.5)
        except Exception as e:
            # Best effort per hotel: report the failure and go on with the next.
            print("错误", e, q)
            time.sleep(0.5)
            q += 1
# Multithreading: one worker thread per chunk of hotels.
threads = [threading.Thread(target=xianc, args=(chunk,)) for chunk in lists]
for t in threads:
    t.start()
# Wait for every worker before closing the client: the workers update
# documents through the shared MongoDB connection, so closing it right after
# start() would take it away while they are still running.
for t in threads:
    t.join()
MC.close()
# Small demo: build three sample records and print how many there are.
s = [{'jj': 'slg{}'.format(idx), 'akg': 66 + idx} for idx in range(3)]
print(len(s))

 

 

 

 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值