创新项目实训(十二)

创新项目实训(十二)

前言

我们组打算搭建一个国内旅游比价网站,
而我负责的部分是各大酒店预订网站的数据获取及整理。

主要参考版上的经验分享+自己的修改理解
小白0经验入门记录、边爬边学习ing
有错误或更好的建议都可以指教讨论

异步协程爬虫

以马蜂窝为例

	#爬取单个页面
    async def one_page(self, url):
        """Fetch one hotel-list page and collect the hotel ids and names on it.

        The response body is decoded as JSON and its ``html`` field is parsed
        as an HTML fragment. The ``data-id`` and ``data-name`` attributes of
        every hotel item are appended to ``self.id_data`` and
        ``self.name_data`` respectively.

        Args:
            url: Fully built list-page URL to request.
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            async with session.get(url) as res:
                text = await res.text()

        # The body is fully read above, so parsing does not need the connection.
        data = json.loads(text)
        fragments = jsonpath(data, '$.html')
        # NOTE(review): assuming `jsonpath` is the PyPI helper that returns
        # False when nothing matches -- confirm. Indexing that result directly
        # would raise a confusing TypeError, so skip pages without the field.
        if not fragments:
            print("响应中没有html字段,跳过: {}".format(url))
            return

        selector = Selector(fragments[0])
        hotel_names = selector.xpath('//*[@class="hotel-item clearfix _j_hotel_item"]/@data-name').extract()
        hotel_ids = selector.xpath('//*[@class="hotel-item clearfix _j_hotel_item"]/@data-id').extract()
        # No await between the two extends, so the lists grow together per page.
        self.id_data.extend(hotel_ids)
        self.name_data.extend(hotel_names)

	 def parse_page(self):
        total = math.ceil((self.get_total_item())/20)
        print("共{}页".format(total))
        a =self.code
        loop = asyncio.get_event_loop()
        tasks = []
        for i in range(1,total,1):
            t = str(int(time.time() * 1000))
            room = '{"_ts":"' + t + '","has_booking_rooms":"0","has_faved":"0","iAdultsNum":"2","iAreaId":"-1","iChildrenNum":"0","iDistance":"10000","iMddId":"'+str(self.code)+'","iPage":"'+str(i)+'","iPoiId":"","iPriceMax":"","iPriceMin":"","iRegionId":"-1","nLat":"0","nLng":"0","position_name":"","sAction":"getPoiList5","sCheckIn":"2021-07-09","sCheckOut":"2021-07-10","sChildrenAge":"","sKeyWord":"","sSortFlag":"DESC","sSortType":"hot","sTags":""}c9d6618dbc657b41a66eb0af952906f1'
            sn = self.par(room.encode('utf-8'))
            url = 'https://www.mafengwo.cn/hotel/ajax.php?iMddId={}&iAreaId=-1&iRegionId=-1&iPoiId=&position_name=&nLat=0&nLng=0&iDistance=10000&sCheckIn=2021-07-09&sCheckOut=2021-07-10&iAdultsNum=2&iChildrenNum=0&sChildrenAge=&iPriceMin=&iPriceMax=&sTags=&sSortType=hot&sSortFlag=DESC&has_booking_rooms=0&has_faved=0&sKeyWord=&iPage={}&sAction=getPoiList5&_ts={}&_sn={}'.format(
                a,i,t, sn)
            tasks.append(self.one_page(url))
        loop.run_until_complete(asyncio.wait(tasks))

多线程

以飞猪网站为例

    def one_page(self, page, total_item):
        """Fetch one Fliggy hotel-list page and extract its hotel fields.

        The request URL is filled with the page number, the total item count,
        ``self.citycode``, the URL-quoted ``self.city``, the check-in/out
        dates and the ``_ksTS`` / ``callback`` pair returned by
        ``self.get_callback_ksts()``. Progress and failures are reported on
        stdout.

        Args:
            page: Page number to request (``currentPage`` in the URL).
            total_item: Total item count sent as ``totalItem``.
        """
        print("开始爬取飞猪{}".format(page))
        _ksTS, callback = self.get_callback_ksts()
        url = 'https://hotel.fliggy.com/ajax/hotelList.htm?pageSize=20&currentPage={}&totalItem={}&startRow=0&endRow=19&city={}&tid=null&market=0&previousChannel=&u=null&detailLinkCity={}&cityName={}&checkIn={}&checkOut={}&browserUserAgent=Mozilla%2F5.0%20(Windows%20NT%2010.0%3B%20Win64%3B%20x64)%20AppleWebKit%2F537.36%20(KHTML%2C%20like%20Gecko)%20Chrome%2F89.0.4389.114%20Safari%2F537.36%20Edg%2F89.0.774.75&userClientIp=58.194.168.58&userSessionId=2794081654&offset=0&keywords=null&priceRange=R0&dangcis=null&brands=null&services=null&order=DEFAULT&dir=DESC&client=null&tagids=null&searchPoiName=undefined&pByRadiusLng=-1&pByRadiusLat=-1&radius=-1&pByRectMinLat=-1&pByRectMinLng=-1&pByRectMaxLat=-1&pByRectMaxLng=-1&lowPrice=-1&highPrice=-1&filterByKezhan=false&searchBy=&searchByTb=false&businessAreaId=null&skipKeywords=false&district=null&backCash=false&shids=null&activity=null&filterDoubleEleven=false&filterByRoomTickets=false&filterHxk=false&filterCxk=false&filterByRoomTicketsAndNeedLogin=false&filterByRoomTicketsAndNeedBuyTicket=false&activityCode=null&searchId=null&userId=null&hotelTypes=null&filterByB2g=false&filterByAgreement=false&bizNo=null&bizType=null&region=0&newYearSpeOffer=false&laterPay=false&sellerId=null&sellerIds=null&isMemberPrice=false&isLaterPayActivity=false&isFilterByTeHui=false&keyWordsType=null&userUniqTag=null&iniSearchKW=false&poiNameFilter=&isFreeCancel=false&isInstantConfirm=false&activityCodes=&adultChildrenCondition=%26roomNum%3D1%26aNum_1%3D2%26cNum_1%3D0&overseaMarket=false&roomNum=1&notFilterActivityCodeShotel=false&poisearch=false&totalPage=1042&previousPage=1&nextPage=2&pageFirstItem=1&firstPage=true&lastPage=false&pageLastItem=20&aNum_1=2&cNum_1=0&cAge_1_1=0&cAge_1_2=0&cAge_1_3=0&_input_charset=utf-8&laterPaySwitch&_ksTS={}&callback={}'.format(
            page, total_item, self.citycode, self.citycode, p.quote(self.city), self.checkin, self.checkout, _ksTS, callback)

        try:
            res = requests.get(url=url, headers=self.header)
        except requests.RequestException as exc:
            # There is no response object when the request itself fails, so
            # report the error rather than touching an unbound `res`.
            print(exc)
            print("失败")
            return

        body = res.text
        # NOTE(review): the extracted fields are neither stored nor returned in
        # the visible code; wire them into storage where this class keeps data.
        ID_Name = re.findall(r'\"shid\"\:(.*?)\,\"name\"\:\"(.*?)\"\,', body)
        star = re.findall(r'\"desc\"\:\"(.*?)\"\,', body)
        count = re.findall(r'\"rateNum\"\:(.*?)\,', body)
        score = re.findall(r'\"rateScore\"\:\"(.*?)\"\,', body)
        price = re.findall(r'\"priceDesp\"\:\"(.*?)\"\,', body)
        print("完成飞猪{}".format(page))


    def page(self):
        """Crawl list pages 1 to 3 in parallel, one thread per page.

        The total item count comes from ``self.get_total_item()``; the second
        value it returns is ignored here. Each thread runs
        ``self.one_page(page_no, total_item)``, and the method returns only
        after every thread has finished.
        """
        total_item, _ = self.get_total_item()
        workers = [
            threading.Thread(target=self.one_page, args=(page_no, total_item))
            for page_no in (1, 2, 3)
        ]
        # Start every worker before joining any so the requests overlap.
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值