多线程爬取携程网酒店数据,星级数据和经纬度

下面是完整的代码:

多线程爬取携程网酒店星级数据和经纬度
# Maps the numeric city id used in the hotel-list URL to the city's Chinese name.
# NOTE(review): the script below reads config.CITY_CODE, so this literal
# presumably mirrors the contents of config.py — confirm.
CITY_CODE = {
    7551: "商洛",
    7587: "吴忠市",
    3222: "平顶山市",
    1541: "白银市",
    1838: "阿坝藏族羌族自治州",
    4124: "甘孜藏族自治州",
    7631: "吕梁市",
    1820: "滨州市",
    21862: "果洛",
    7807: "海北藏族自治州",
    7752: "海东市",
    7794: "海南藏族自治州",
    7589: "海西蒙古族藏族自治州",
    7802: "黄南藏族自治州",
    21114: "玉树",
    4216: "石嘴山市",
    3221: "周口市",
    21654: "甘南藏族自治州",
    21892: "临夏",
    7707: "陇南市",
    3887: "巴彦淖尔市",
    3976: "鄂尔多斯市",
    4255: "呼伦贝尔市",
    7518: "乌兰察布市",
    21269: "阿拉善盟",
    21021: "兴安盟",
    7576: "锡林郭勒盟",
    7: "青岛",
    10: "西安",
    99: "银川",
    100: "兰州",
    103: "呼和浩特",
    105: "太原",
    110: "延安",
    111: "咸阳",
    112: "宝鸡",
    118: "铜川",
    124: "西宁",
    136: "大同",
    137: "长治",
    139: "临汾",
    140: "运城",
    141: "包头",
    144: "济南",
    181: "安阳",
    202: "赤峰",
    236: "东营",
    318: "济宁",
    321: "固原",
    331: "开封",
    350: "洛阳",
    385: "南阳",
    388: "平凉",
    404: "庆阳",
    436: "三门峡",
    441: "商丘",
    454: "泰安",
    458: "通辽",
    464: "天水",
    475: "潍坊",
    479: "威海",
    507: "新乡",
    510: "信阳",
    513: "忻州",
    527: "榆林",
    533: "烟台",
    542: "淄博",
    551: "驻马店",
    556: "中卫",
    559: "郑州",
    569: "临沂",
    614: "枣庄",
    664: "武威",
    907: "阳泉",
    951: "鹤壁",
    1021: "定西",
    1030: "渭南",
    1071: "聊城",
    1074: "菏泽",
    1088: "漯河",
    1092: "晋城",
    1093: "焦作",
    1094: "许昌",
    1106: "日照",
    1133: "乌海",
    1232: "濮阳",
    1317: "朔州",
    1370: "德州",
    1453: "晋中",
    1454: "济源",
}

import re
import threading
import urllib
import urllib.parse
import urllib.request
from queue import Empty, Queue

import pandas as pd

import config
# Number of worker threads created and started by the __main__ block.
num_of_threads=10
def write_fun(line):
    """Append ``line`` to the output file ``酒店0502.csv`` (UTF-8).

    The path is relative, so the file is created in the current working
    directory if it does not exist yet.

    Args:
        line: Text to append verbatim. Nothing is added to it, so the caller
            must include the trailing newline.
    """
    # The ``with`` block closes the file on exit, so no explicit close() is needed.
    with open('酒店0502.csv', 'a', encoding='utf-8') as f:
        f.write(line)


def load_data_from_dict(o, *keys):
    """Follow ``keys`` one level at a time through nested dicts, starting at ``o``.

    Args:
        o: The outermost object, expected to be a dict.
        *keys: Keys to look up in order, one per nesting level.

    Returns:
        The value stored under the last key. ``None`` is returned when no keys
        are given, when a key is missing, or when any level reached before the
        last lookup is falsy (``None``, empty dict, ...) or is not a dict.
    """
    if not keys:
        return None

    current = o
    for key in keys:
        # Every level we descend into must be a non-empty dict.
        if not current or not isinstance(current, dict):
            return None
        current = current.get(key)
    return current
import  json

#西安市 10
#洛阳 350
#青岛市 7
#宝鸡市 112
#晋中市   1453
#济南市  144
#烟台市  533
#泰安市  454
#临沂市  569
#潍坊市  475
#太原市  105
#济宁市 318
#运城市  140
# 兰州市   100
#开封市 331
#大同市 136
#安阳市 181
#淄博市 542
#南阳市 385
#渭南市 1030
# 阳泉市 907
# 临汾市 139
# 延安市   110
#焦作市 1093
# 平顶山市 3222
#新乡市 507
#三门峡市 436
# 驻马店市 551
# 周口市 3221
#信阳市 510
#鹤壁市 951
#许昌市  1094
#商丘市 441
# 濮阳市 1232
#漯河市 1088



class myThread(threading.Thread):
    """Worker thread that scrapes one city at a time from a shared queue.

    Attributes:
        threadID: Identifier supplied by the creator; stored but not used
            by this class.
        city_queue: Queue of city codes shared by all workers. Each code is
            looked up in ``config.CITY_CODE`` to get the city name.
        singal: ``threading.Event`` that is set on construction; nothing in
            this class waits on it.
    """

    def __init__(self, threadID, city_queue):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.city_queue = city_queue
        self.singal = threading.Event()
        self.singal.set()

    def run(self):
        """Take city codes off the queue and scrape each, until it is empty."""
        while True:
            try:
                # get_nowait() rather than empty() followed by get(): with
                # several workers, another thread can take the last item
                # between the two calls and a blocking get() would then
                # hang this thread forever.
                code = self.city_queue.get_nowait()
            except Empty:
                return
            print(code)
            # A code missing from the mapping yields the string "None".
            ci = str(config.CITY_CODE.get(code))
            print(ci)
            main(ci, code)

def main(putname, code):
    """Scrape Ctrip hotel listings for one city and append them to the CSV.

    The city is first looked up on the you.ctrip.com site search to obtain
    the identifier found in its ``/sight/<identifier>/`` link. The non-digit
    part of that identifier is then sent as ``cityename`` to the hotel list
    page, which is requested page by page. For every hotel one line
    ``putname,cityName,hotelName,address,star,lat,lng`` is passed to
    ``write_fun``.

    Args:
        putname: City name used as the search query and as the first CSV field.
        code: City id sent as ``city`` and ``optionId`` in the hotel list URL.

    Returns:
        None. If the search yields no usable city identifier, a message is
        printed and nothing is scraped.

    Raises:
        urllib.error.URLError: If the initial city search request fails.
            Failures on individual hotel pages are printed and skipped.
    """
    # --- Step 1: resolve the city identifier through the site search. ---
    search_url = 'https://you.ctrip.com/searchsite/?query={}'.format(
        urllib.parse.quote(putname))
    search_html = urllib.request.urlopen(search_url).read().decode("utf-8", "ignore")
    links = re.findall('<dt><a href="(.*?)" target="_blank">(.*?)</a>', search_html, re.S)

    rel_cityurl = ''
    rel_ityname = ''
    for href, title in links:
        if "sight" not in href:
            continue
        found = re.findall('/sight/(.*?)/.*?', href, re.S)
        # A link can contain "sight" without matching the /sight/<id>/ shape;
        # skip it instead of failing on found[0].
        if found:
            # No break: as before, the last matching link wins.
            rel_cityurl = found[0]
            rel_ityname = title
    print(rel_cityurl)
    print(rel_ityname)

    rel_citynum = re.findall(r"(\d+)", rel_cityurl)
    rel_cityname = re.findall(r"(\D+)", rel_cityurl)
    if not rel_cityname:
        # Without an identifier the hotel URL cannot be built.
        print("no city identifier found for {} ({}); skipping".format(putname, code))
        return
    if rel_citynum:
        print(rel_citynum[0])
    print(rel_cityname[0])

    # --- Step 2: walk the hotel list pages. ---
    # The page embeds its data as a JSON literal assigned to window.IBU_HOTEL.
    json_re = re.compile('window.IBU_HOTEL=(.*?);\n    __webpack_public_path__=', re.S)
    for i in range(1, 1000):
        print("********开始第{}页面********".format(i))
        url = 'https://hotels.ctrip.com/hotels/listPage?cityename={1}&city={0}&optionId={0}&optionType=City&checkin=2021/05/10&checkout=2021/05/11&pageNo={2}'.format(code, rel_cityname[0], i)
        try:
            pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
            hoteljson = json.loads(json_re.findall(pagedata)[0])
            hoteldetails = load_data_from_dict(
                hoteljson, 'initData', 'firstPageList', 'hotelList', 'list')
            if not hoteldetails:
                # An empty or missing list is treated as the end of the
                # results, so the remaining pages are not requested.
                break
            for hoteldetail in hoteldetails:
                base = load_data_from_dict(hoteldetail, 'base')
                position = load_data_from_dict(hoteldetail, 'position')
                fields = (
                    putname,
                    load_data_from_dict(position, 'cityName'),
                    load_data_from_dict(base, 'hotelName'),
                    load_data_from_dict(position, 'address'),
                    load_data_from_dict(base, 'star'),
                    load_data_from_dict(position, 'lat'),
                    load_data_from_dict(position, 'lng'),
                )
                # NOTE(review): fields are joined unquoted, so a comma inside
                # an address adds an extra column. Kept as-is to preserve the
                # existing file format.
                line = ','.join(str(value) for value in fields) + '\n'
                print(line)
                write_fun(line)
        except Exception as e:
            # Best effort per page: report the failure and try the next page
            # rather than dropping it silently.
            print("page {} of {} failed: {!r}".format(i, putname, e))



       

if __name__ == "__main__":
    # Queue every city code first so all workers share one complete work list.
    city_queue = Queue()
    citys = config.CITY_CODE
    for city_code in citys:
        city_queue.put(city_code)

    # Start the worker pool exactly once. Previously this sat inside the loop
    # above, which spawned num_of_threads new threads for every city.
    threads = [myThread(i, city_queue) for i in range(num_of_threads)]
    for worker in threads:
        worker.start()
    # Wait for all workers to finish before the script exits.
    for worker in threads:
        worker.join()

#    main ('西安市', 10)

# # 开启多线程模式
# thread = []
# for i in range (0, total, range_count) :
#     # i, range_count SQL 查询起始位置,查询数量
#     t = threading.Thread (target=main,
#                           args=(i, range_count))
#     thread.append (t)
#
# for i in range (0, len (thread)) :
#     thread[i].start ()
#
# for i in range (0, len (thread)) :
#     thread[i].join ()











  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值