#一、采集详细 页面(get请求+多线程),酒店名、详细页面url、地址、星级/等级、装修年限、综合评分、四项评分
# 注意!!!:酒店所属区域(主城九区)、景区等信息不要用经纬度推算(不知道各区域边界的经纬度,按距离计算不准确;渝中区面积太小,酒店和景点又很集中)。
# 采集图中的 1 和 2:1 是具体的位置描述,用分词确定位置;2 是途牛网站划分的区域;结合 1 和 2 得到最终位置。
import jwdsj
import json
import threading
from scrapy import Selector
from pymongo import MongoClient
from requests import get
import time
# Browser-like User-Agent sent with every request of this section.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/75.0.3770.142 Safari/537.36"
    ),
}
# MongoDB client; `table` is the collection the scraped hotel records go to.
MC = MongoClient("192.168.210.45")
# Edit the collection name below to match the collection of collected links.
table = MC["shuju_tuniu_jd"]["copy_tuniu"]
# Split the list-page numbers into one chunk per worker thread.
m = 1  # kept for compatibility; not read by any code visible in this file
TOTAL_PAGES = 1406  # change this when the number of list pages changes
WORKER_COUNT = 8  # number of chunks, one per worker thread

ii = list(range(1, TOTAL_PAGES + 1))  # page numbers 1..TOTAL_PAGES
fenshu = TOTAL_PAGES // WORKER_COUNT  # pages per chunk, rounded down
# Every chunk except the last holds exactly `fenshu` pages; the last chunk
# also takes the remainder so that no page is dropped.
lists = [ii[k * fenshu:(k + 1) * fenshu] for k in range(WORKER_COUNT - 1)]
lists.append(ii[(WORKER_COUNT - 1) * fenshu:])
# Hotel list endpoint; the only placeholder is the page number.
_LIST_URL = 'http://hotel.tuniu.com/ajax/list?search%5Bcity%5D=300&search%5BcheckInDate%5D=2019-8-30&search%5BcheckOutDate%5D=2019-8-31&search%5BcityCode%5D=300&page={}'


def _build_hotel_record(hotel):
    """Map one hotel entry of the list response onto the stored document."""
    position = hotel['pos']
    level_info = hotel['levelInfo']
    level_title = level_info['title']  # text describing where the grade comes from
    if level_title[:4] == '途牛用户':
        grade_source = '途牛'  # grade given by Tuniu users
    elif level_title[:4] == '国家旅游':
        grade_source = level_title[:5]  # star rating: keep the first five characters
    else:
        grade_source = level_title
    decorate_year = hotel['decorateYear']

    record = {
        'flag': 0,
        'name': hotel['name'],  # hotel name
        'url': 'http://hotel.tuniu.com' + hotel['url'],  # detail-page URL
        'address': hotel['address'],  # hotel address
        'l_lng': position['lng'],  # longitude of the hotel address
        'l_lat': position['lat'],  # latitude of the hotel address
        'gradename': level_info['name'],  # hotel grade / star level
        'gradesource': grade_source,
        # Renovation year; an empty string is stored as '无'.
        'decorateYear': decorate_year if decorate_year != '' else '无',
        'startPrice': hotel['startPrice'],  # starting price
        'pl_remarkScore': hotel['remarkScore'],  # overall review score
    }
    # One key per review aspect, named after the aspect itself.
    for aspect in hotel['remarkAspect']:
        record[aspect['name']] = aspect['score']
    record['commentnumber'] = hotel['remarkCount']  # number of reviews
    return record


def xianc(list=None):
    """Scrape the given hotel list pages and store every hotel in MongoDB.

    For each page number the list endpoint is requested with ``HEADERS``,
    every hotel in ``data.list`` is converted to a document and inserted
    into ``table``, and a progress line is printed.

    Args:
        list: Page numbers to fetch. ``None`` (the default) or an empty
            sequence means there is nothing to do. The parameter keeps its
            original name for compatibility even though it shadows the
            builtin.

    Returns:
        None.

    Note:
        The original carried a disabled ``try/except`` that printed '验证'
        and slept 60 s on failure. It is still disabled: any request, JSON
        or key error propagates and ends this worker.
    """
    pages = [] if list is None else list
    m = 0  # pages finished by this worker
    for page in pages[:]:
        payload = get(url=_LIST_URL.format(page), headers=HEADERS).json()
        hotels = payload["data"]["list"]
        for n, hotel in enumerate(hotels, 1):
            table.insert_one(_build_hotel_record(hotel))  # store in MongoDB
            print('进度:{}/19 {}/1406'.format(n, m))
        time.sleep(3)  # pause between page requests
        m += 1
# Start one worker thread per page chunk.
threads = [threading.Thread(target=xianc, args=(chunk,)) for chunk in lists]
for t in threads:
    t.start()
# Wait for every worker before closing the client: the workers write through
# the module-level `table`, so closing (or rebinding) it while they are still
# running would break or misdirect their inserts.
for t in threads:
    t.join()
MC.close()
#二、采集价格 post 请求(这里是把所有种类的房间算平均值 其实可以再结合房间面积 可入住人数 床型等进行计算)
from pymongo import MongoClient
import time
import json
import requests
MC = MongoClient("192.168.210.45")
table = MC.shuju_tuniu_jd.tuniu0828  # edit this collection name to match the collected links
data = table.find({'flag': 4})  # original note mentions MongoDB's never-timeout cursor option
p = 0  # hotels updated with real prices so far

postUrl = 'http://hotel.tuniu.com/hotel-api/hotel/rateplan?c=%7B%22ct%22%3A20000%7D'
# Request headers.
payloadHeader = {
    'Host': 'hotel.tuniu.com',
    'Content-Type': 'application/json',
}


def _price_summary(rooms):
    """Return (max, min, average) over all rate-plan prices, or None.

    None is returned when the rooms contain no rate plan at all, so the
    caller never indexes or divides by an empty price list.
    """
    prices = [
        plan['avgSalePrice']['price']
        for room in rooms
        for plan in room['ratePlans']
    ]
    if not prices:
        return None
    return max(prices), min(prices), sum(prices) / len(prices)


for k in data:
    url_up = k['url']
    url_0 = url_up[url_up.rfind('/') + 1:]  # hotel id: text after the last '/'
    payloadData = {
        'checkIn': "2019-09-03",
        'checkOut': "2019-09-04",
        'roomNum': 1,
        'adultNum': 2,
        'childAges': [],
        'childNum': 0,
        'filters': [],
        'secondaryDist': {
            'pValue': "",
            'userType': 0
        },
        'hotelId': "{}".format(url_0),
    }
    r = requests.post(postUrl, data=json.dumps(payloadData), headers=payloadHeader).json()
    # An error reply such as
    # {'success': False, 'errorCode': 7081002, 'msg': '价格计划查询异常'}
    # has no 'data' key; it is treated like a hotel without rooms.
    rooms = r['data']['rooms'] if 'data' in r else []
    summary = _price_summary(rooms)
    if summary is None:
        # No price available: mark the hotel as processed with placeholders.
        a = {'flag': 1, 'max': "无", 'min': "无", 'average': "无"}
    else:
        maxz, minz, pinjun = summary
        a = {'flag': 1, 'max': maxz, 'min': minz, 'average': pinjun}
    # The original guarded the priced update with `len(a) > 1`, which is always
    # true for a four-key dict, so its '手动验证' branch could never run.
    table.update_one({"url": url_up}, {'$set': a})
    if summary is not None:
        p += 1
        print(p)
MC.close()
#三、途牛用的经纬度来自 百度地图(跳过)
# 景点的经纬度可以在列表页面得到,但途牛仅列举了部分景点,且渝中区的部分景点(洪崖洞、解放碑)离得太近,按距离匹配景点与酒店时会出问题。
import jwdsj
import json
from urllib.request import Request, urlopen
from pymongo import MongoClient
import time
HEADERS = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
MC = MongoClient("192.168.210.45")
table = MC.shuju_tuniu_jd.tuniu0827  # edit this collection name to match the collected links
m = 1
URL = 'http://hotel.tuniu.com/ajax/filterList'
req = Request(url=URL, headers=HEADERS)
# Close the HTTP response as soon as the body has been read.
with urlopen(req) as response:
    html = response.read().decode('GBK')
hh0 = json.loads(html)
# Second entry of the filter list; its 'pros' are the location categories.
hh = hh0["data"]["filterList"][1]['pros']
for category in hh:
    hh_1 = category['name']  # location category
    for option in category['pros']:
        # A fresh dict per option, printed immediately; nothing is stored.
        a = {
            '位置分类': hh_1,
            'xuanxiang': option['name'],
            'lng': option['lng'],
            'lat': option['lat'],
        }
        print(a)
# This section only prints; release the client it opened.
MC.close()
#四、评论 get请求(这里仅涉及 一般和待改善 部分)
import useragent
from requests import get
import requests
import threading
from scrapy import Selector
from pymongo import MongoClient
import time
MC = MongoClient("192.168.210.45")
# Edit the collection name below to match the collection of collected links.
table = MC["shuju_tuniu_jd"]["tuniu0828"]
# Documents with flag 4 whose 'general_wb' is still an empty string; the
# projection leaves out '_id' and 'address'.
# (Original note mentions MongoDB's never-timeout cursor option.)
data = table.find(
    {"flag": 4, 'general_wb': ''},
    {"_id": 0, "address": 0},
)
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.100 Safari/537.36'
    ),
}
m = 0
# Keep only the fields the workers need from each queried document.
ii1 = [
    {
        'generalCount': k['generalCount'],
        'negativeCount': k['negativeCount'],
        'url': k["url"],
    }
    for k in data
]
ii = ii1
# Alternative: process only one third of the work list per run.
# ii=ii1[:int(len(ii1)/3)]
# ii=ii1[int(len(ii1)/3):int(len(ii1)/3)*2]
# ii=ii1[int(len(ii1)/3)*2:]
print(len(ii))
fenshu = len(ii) // 8  # hotels per worker, rounded down
# Seven chunks of `fenshu` hotels; the eighth also takes the remainder
# (and everything when there are fewer than eight hotels).
lists = [ii[n * fenshu:(n + 1) * fenshu] for n in range(7)]
lists.append(ii[7 * fenshu:])
_COMMENTS_PER_PAGE = 19  # page size used to derive the page count from a comment count
# Comment endpoint; placeholders are hotel id, page number and comment group.
_REMARK_URL = "http://hotel.tuniu.com/ajax/remarkQuery?hotelId={}&p={}&group={}"
_COMMENT_TEXT_XPATH = '//div[@class="u5 clearfix"]/div[@class="a2"]/div[@class="b2"]/p/text()'
_COMMENT_DATE_XPATH = '//div[@class="u5 clearfix"]/div[@class="a2"]/div[@class ="b4"]/span/text()'
_GROUP_GENERAL = 2  # "general" comments
_GROUP_NEGATIVE = 3  # "to be improved" comments


def _comment_page_count(comment_count):
    """Return how many pages are needed for `comment_count` comments."""
    return (int(comment_count) + _COMMENTS_PER_PAGE - 1) // _COMMENTS_PER_PAGE


def _fetch_comments(hotel_id, group, pages):
    """Download `pages` pages of one comment group as 'text|stay date' strings."""
    comments = []
    for page in range(1, pages + 1):
        url = _REMARK_URL.format(hotel_id, page, group)
        selector = Selector(text=get(url=url, headers=HEADERS).text)
        texts = selector.xpath(_COMMENT_TEXT_XPATH).extract()  # comment text
        dates = selector.xpath(_COMMENT_DATE_XPATH).extract()  # stay date
        # Index access (not zip) keeps the original failure mode: a page with
        # fewer dates than texts raises and the hotel is skipped by the caller.
        for idx, text in enumerate(texts):
            comments.append('%s|%s' % (text.strip(), dates[idx]))  # e.g. '酒店|2019-1-1'
    return comments


def _join_comments(comments):
    """Concatenate comments as '(c1)(c2)...'; an empty list gives ''."""
    return "".join("({})".format(comment) for comment in comments)


def xianc(list=None):
    """Collect the general and to-be-improved comments of each hotel.

    For every work item the two comment groups are downloaded page by page,
    joined into one string each and written to ``table`` with ``$set``.
    A group whose count yields no page is stored as '无'. When general
    comments were expected but none could be extracted, nothing is written
    and 'cw' is printed instead.

    Args:
        list: Work items, each a mapping with 'url', 'generalCount' and
            'negativeCount'. ``None`` (the default) or an empty sequence
            means there is nothing to do. The parameter keeps its original
            name for compatibility even though it shadows the builtin.

    Returns:
        None. Any exception raised while handling one hotel is printed and
        the loop moves on to the next hotel.
    """
    hotels = [] if list is None else list
    m = 0  # hotels processed without an exception
    q = 0  # exceptions seen so far
    for kks in hotels[:]:
        try:
            url_up = kks['url']
            # Hotel id: the text after the last '/', e.g. the digits in
            # 'http://hotel.tuniu.com/detail/1567448166'.
            url_0 = url_up[url_up.rfind('/') + 1:]
            general_pages = _comment_page_count(kks['generalCount'])
            negative_pages = _comment_page_count(kks['negativeCount'])
            time.sleep(0.2)

            b = {'flag': 4}
            if general_pages > 0:
                b['general_wb'] = _join_comments(
                    _fetch_comments(url_0, _GROUP_GENERAL, general_pages))
            else:
                b['general_wb'] = '无'
            if negative_pages > 0:
                b['negative_wb'] = _join_comments(
                    _fetch_comments(url_0, _GROUP_NEGATIVE, negative_pages))
            else:
                b['negative_wb'] = '无'
            time.sleep(0.2)

            if b['general_wb'] != "":
                table.update_one({"url": url_up}, {'$set': b})
            else:
                # Pages were expected but nothing was extracted; leave the
                # document untouched.
                print('cw')
            m += 1
            print('{}'.format(m))
            time.sleep(0.5)
        except Exception as e:
            # Per-hotel boundary: report the failure and continue.
            print("错误", e, q)
            time.sleep(0.5)
            q += 1
            # Disabled in the original: rotate the User-Agent after an error.
            # HEADERS = {'User-Agent': useragent.myuser[q]}
# Multithreading: start one worker thread per chunk of hotels.
threads = [threading.Thread(target=xianc, args=(chunk,)) for chunk in lists]
for t in threads:
    t.start()
# Wait for every worker before closing the client they write through.
for t in threads:
    t.join()
MC.close()
# Scratch check: build three sample dicts and print how many were built.
s = [{'jj': 'slg{}'.format(i), 'akg': 66 + i} for i in range(3)]
print(len(s))