python-爬虫-到喜啦酒店信息抓取

到喜啦 酒店信息抓取

1. 招商工作需要成都的酒店信息，时间比较紧，所以这里不多做说明。
直接上代码：

import requests
import pymysql
from lxml import etree
import re

class DXL:
    """Scrape hotel listings from cd.daoxila.com into the ``chengdu_hall`` table.

    Typical use is two passes: :meth:`shop_mess` crawls the listing pages and
    stores the raw text of every detail page, then :meth:`insert` rewrites the
    stored ``price`` / ``hall_num`` / ``table_max`` text into compact values.
    """

    # Area names appended to the listing URL; each one is crawled in turn.
    AREAS = (
        'WuHou', 'JinNiu', 'QingYang', 'JinJiang',
        'ChengDuGaoXin', 'ChengHua', 'ChengDuJinJiao', 'HuaYang', 'XinDu',
        'WenJiang', 'ShuangLiuQu', 'LongQuan', 'DuJiangYan',
    )
    # Listing pages requested per area (page-1 .. page-10).
    MAX_PAGES = 10
    # Seconds to wait for the site before giving up on a single request;
    # without a timeout a stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 10

    # Compiled once instead of on every row / listing entry.
    _DIGITS = re.compile(r'\d+')
    _NON_SPACE = re.compile(r'\S+')

    # (column, XPath) pairs read from a detail page, in insertion order.
    _DETAIL_XPATHS = (
        ('hall_num', '/html/body/div[1]/div[2]/dl/dd[1]/text()'),
        ('shop_type', '/html/body/div[1]/div[1]/ul/li[2]/a/text()'),
        ('store_name', '/html/body/div[1]/div[2]/h1/text()'),
        ('price', '/html/body/div[1]/div[2]/dl/dd[4]/text()'),
        ('address', '/html/body/div[1]/div[2]/div[1]/text()'),
        ('table_max', '/html/body/div[1]/div[2]/dl/dd[3]/text()'),
    )

    def __init__(self):
        """Prepare the request headers and open the MySQL connection."""
        # NOTE(review): the session cookie and the database credentials below
        # are hard-coded; consider loading them from configuration instead.
        # NOTE(review): 'br' is advertised in Accept-Encoding; confirm the
        # HTTP stack in use can decode brotli responses.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': '_da=DA.234542667.1565251587567; city_id=144; city=cd; _ga=GA1.2.1039941117.1565251588; PHPSESSID=SqH33WnW595aGEfx-wV8hZT0Cw; _uab_collina=156525163408777721466634; _gid=GA1.2.1336936516.1565318781; utm_source=seo; utm_medium=pc_baidu_seo; utm_campaign=; utm_term=%E5%88%B0%E5%96%9C%E5%95%A6; utm_content=; a_i=; _ba=778128998647663; Hm_lvt_1d2519efe52fbcddc471e1b2ee80eb9e=1565251588,1565318781,1565332693,1565335299; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216c7043d0a58d8-0b70f68a79097c-3c375f0d-2073600-16c7043d0a7aab%22%2C%22%24device_id%22%3A%2216c7043d0a58d8-0b70f68a79097c-3c375f0d-2073600-16c7043d0a7aab%22%2C%22props%22%3A%7B%22%24latest_referrer%22%3A%22https%3A%2F%2Fsp0.baidu.com%2F9q9JcDHa2gU2pMbgoY3K%2Fadrc.php%3Ft%3D06KL00c00f7-cK30swF-0QSDcsjeZbdI00000KGDr-C00000V3ZsCC.THvdETSzsQc0UWdBmy-bIy9EUyNxTAT0T1Y3rHckuh7huH0snAf4rAfs0ZRqwbn3PW9afYnkPH6vrjRLn1c1nYnsnbR%22%2C%22%24latest_referrer_host%22%3A%22sp0.baidu.com%22%2C%22%24latest_traffic_source_type%22%3A%22%E5%BC%95%E8%8D%90%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; Hm_lpvt_1d2519efe52fbcddc471e1b2ee80eb9e=1565335886',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
        }
        self.url = 'https://cd.daoxila.com/HunYan/'
        self.db = pymysql.connect(host='172.16.201.20', port=3306, user='xiaoshubiao', passwd='654321', db='sina',
                                  charset='utf8mb4')
        self.cursor = self.db.cursor()

    def _fetch_page(self, url):
        """Download *url* and return its parsed HTML tree.

        Returns None (after printing the reason) when the request fails or
        the body is empty, so that one bad page does not abort the crawl.
        """
        try:
            response = requests.get(url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print('request failed for %s: %s' % (url, exc))
            return None
        html = response.text
        if not html.strip():
            # Nothing to parse; callers treat None as "skip this page".
            print('empty response for %s' % url)
            return None
        return etree.HTML(html)

    @staticmethod
    def _first(nodes, default=''):
        """Return the first XPath result, or *default* when nothing matched."""
        return nodes[0] if nodes else default

    @classmethod
    def _normalise_row(cls, price, hall_num, table_max):
        """Reduce the raw text of one stored row to compact values.

        Returns ``(price, hall_num, table_max)`` where *price* is the first
        one or two numbers joined by ``-`` plus the ``/桌`` suffix, *hall_num*
        is the first number found, and *table_max* is the second number found
        (or the only one, which keeps the operation idempotent on rows that
        were already normalised). Returns None when any field has no digits.
        """
        def numbers(value):
            # NULL columns arrive as None; treat them as text with no digits.
            return cls._DIGITS.findall('' if value is None else str(value))

        price_numbers = numbers(price)
        hall_numbers = numbers(hall_num)
        table_numbers = numbers(table_max)
        if not (price_numbers and hall_numbers and table_numbers):
            return None
        new_price = '-'.join(price_numbers[:2]) + '/桌'
        new_table_max = table_numbers[1] if len(table_numbers) > 1 else table_numbers[0]
        return new_price, hall_numbers[0], new_table_max

    def shop_mess(self):
        """Crawl every listing page of every area and scrape each hotel found.

        For each area in ``AREAS`` the pages ``page-1`` .. ``page-MAX_PAGES``
        are fetched; every hotel link on a page is handed to
        :meth:`shop_info` together with the area text shown next to it.
        """
        for area_slug in self.AREAS:
            area_url = self.url + area_slug
            for page_no in range(1, self.MAX_PAGES + 1):
                fin_url = area_url + '/page-' + str(page_no)
                print(fin_url)
                page = self._fetch_page(fin_url)
                if page is None:
                    continue
                href = page.xpath('//*[@id="hotelView"]/li/article/div/div[1]/h2/a/@href')
                area = page.xpath('//*[@id="hotelView"]/li/article/div/div[2]/i[5]/text()')
                # NOTE(review): the two lists are paired by position; if a
                # listing lacks the area element the pairs shift - verify
                # against the live markup.
                for h, a in zip(href, area):
                    # Keep only the first run of non-blank characters; a blank
                    # text node yields an empty area instead of crashing.
                    match = self._NON_SPACE.search(a)
                    area_name = match.group() if match else ''
                    in_url = 'https://cd.daoxila.com' + h
                    self.shop_info(in_url, area_name)

    def shop_info(self, in_url, area):
        """Scrape one hotel detail page and store it in ``chengdu_hall``.

        Args:
            in_url: absolute URL of the hotel detail page.
            area: area text taken from the listing page.

        Fields whose XPath matches nothing are stored as empty strings; a
        page that cannot be fetched is skipped.
        """
        page = self._fetch_page(in_url)
        if page is None:
            return
        item = {'area': area}
        for column, path in self._DETAIL_XPATHS:
            item[column] = self._first(page.xpath(path))
        self.insert_sql('chengdu_hall', item)

    def insert_sql(self, dbname, item):
        """Insert *item* as one row and commit.

        Args:
            dbname: name of the target table (despite the parameter name).
            item: mapping of column name to value; values are stored as text.

        Values are sent as query parameters so that quotes or backslashes in
        scraped text can neither break nor alter the statement. Table and
        column names are interpolated directly and must come from trusted
        code. Errors are printed and otherwise ignored, so one bad row does
        not stop the crawl.
        """
        columns = ','.join(item)
        placeholders = ','.join(['%s'] * len(item))
        sql = 'insert into %s(%s)values(%s)' % (dbname, columns, placeholders)
        params = [str(v) for v in item.values()]
        try:
            self.cursor.execute(sql, params)
            self.db.commit()
        except Exception as e:
            # Deliberately best-effort: report the failure and carry on.
            print(e)

    def insert(self):
        """Normalise ``price``, ``hall_num`` and ``table_max`` of stored rows.

        Reads every row of ``chengdu_hall``, rewrites the three columns with
        the values produced by :meth:`_normalise_row`, and commits after each
        update. Rows that contain no digits in one of the columns are reported
        and left untouched.
        """
        self.cursor.execute('select id, price,hall_num,table_max from chengdu_hall')
        rows = self.cursor.fetchall()
        update_sql = ('update chengdu_hall set price = %s, hall_num = %s, '
                      'table_max = %s where id = %s')
        for row_id, price, hall_num, table_max in rows:
            cleaned = self._normalise_row(price, hall_num, table_max)
            if cleaned is None:
                print('skip row %s: no digits in price/hall_num/table_max' % (row_id,))
                continue
            params = cleaned + (row_id,)
            print(update_sql, params)
            self.cursor.execute(update_sql, params)
            self.db.commit()




if __name__ == '__main__':
    # Script entry point: open the scraper (which connects to the database)
    # and run only the clean-up pass over rows already stored in
    # chengdu_hall. The crawl itself (shop_mess) is not started here.
    DXL().insert()

以上就是基本的到喜啦酒店信息抓取过程。注意：示例入口只调用了 insert()，即对已入库的 chengdu_hall 数据做清洗；首次抓取时需要先调用 shop_mess() 把数据抓取入库。

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值