Python crawler: scraping Lianjia rental listings (zufang)

import re
import requests
import redis
from lxml import etree
from fake_useragent import UserAgent
import pymysql
class CoityArea:
    def __init__(self):
        # initialize the Redis connection
        self.r = self.get_redis()
    def __call__(self, *args, **kwargs):
        self.get_city_area()
    # Redis database connection
    def get_redis(self):
        return redis.Redis(host='127.0.0.1', port=6379, db=1)
    def get_city_area(self):
        base_url = "https://bj.lianjia.com/zufang/"
        html_xml = self.get_html(base_url)
        # the xpath returns each district's href and link text as adjacent items (url, name, url, name, ...)
        city_area_list = html_xml.xpath('//ul[@data-target="area"]//li[position()>1]/a/text() | //ul[@data-target="area"]//li[position()>1]/a/@href')
        # print(city_area_list)
        # print(len(city_area_list))
        for city_area in city_area_list:
            if "zufang" in city_area:
                city_area="https://bj.lianjia.com"+city_area
                # print(city_area)
            # store the district info (url and name) in Redis
            self.r.rpush("city_area_list", city_area)
    # fetch the page at the given url and return an lxml HTML object
    def get_html(self,url):
        headers={
            'User-Agent':UserAgent().random
        }
        response = requests.get(url,headers=headers)
        html=response.text
        # print(html)
        return etree.HTML(html)
class BusinessCicle(CoityArea):
    def __call__(self, *args, **kwargs):
        self.get_business_cicle()
    # get the business-circle (sub-district) urls and names from each district page
    def get_business_cicle(self):
        # read the district list back from Redis
        city_area_list = self.r.lrange("city_area_list", 0, -1)
        # print(city_area_list)
        for index in range(0, len(city_area_list), 2):
            # each district is stored as two adjacent items: url, then name
            city_area_url = city_area_list[index].decode('utf-8')
            city_area_name = city_area_list[index + 1].decode('utf-8')
            # print(city_area_url)
            # print(city_area_name)
            # fetch the district page
            html_xml = self.get_html(city_area_url)
            # business-circle hrefs and names, again as alternating items
            business_cicle_list = html_xml.xpath('//ul[@data-target="area"]/li[@data-type="bizcircle"][position()>1]/a/@href | //ul[@data-target="area"]/li[@data-type="bizcircle"][position()>1]/a/text()')
            # print(business_cicle_list)
            for i in range(len(business_cicle_list)):
                business_cicle = business_cicle_list[i]
                # odd indexes are the circle names; prefix them with the district name, joined by "-"
                if i % 2 == 1:
                    business_cicle = city_area_name + "-" + business_cicle_list[i]
                # print(business_cicle)
                self.r.rpush("business_cicle", business_cicle)
class Lian(CoityArea):
    def __call__(self, *args, **kwargs):
        self.count = 1
        self.conn_mysql()
        self.count_ucid=1
        self.get_page_url()

    def get_page_url(self):  # read the business-circle list from Redis and page through each circle
        business_cicle_list=self.r.lrange("business_cicle",0,-1)
        # print(business_cicle_list)
        for index in range(0,len(business_cicle_list),2):
            business_cicle_url=business_cicle_list[index].decode("utf-8")
            business_cicle_name=business_cicle_list[index+1].decode('utf-8')
            # build the full business-circle url
            business_cicle_url = "https://bj.lianjia.com" + business_cicle_url
            # print('================= downloading {} ==================='.format(business_cicle_name))
            # print(business_cicle_url,business_cicle_name)
            html_xml=self.get_html(business_cicle_url)
            # get the maximum page number
            max_page = html_xml.xpath('//div/@data-totalpage')
            # print(max_page)
            # if no max page number is found, max_page is an empty list: skip this circle
            if not max_page:
                continue
            max_page = int(max_page[0])
            # print(max_page, type(max_page))
            # generate the paginated urls
            for page in range(1, max_page + 1):
                print('============ downloading page {} ============'.format(page))
                page_url = business_cicle_url + "pg{}".format(page)
                # print(page_url)
                self.get_data(page_url)
                # break
            # break
    def get_data(self, page_url):  # parse one listing page
        html_xml=self.get_html(page_url)
        # narrow the scope to each listing card
        div_list_all = html_xml.xpath('//div[@class="content__list"]//div[@class="content__list--item"]')
        for div_list in div_list_all:
            # 1 cover image (swap the thumbnail size for the full-size version)
            pic = div_list.xpath('.//img/@data-src')[0]
            floor_pic = pic.replace('250x182', '2000x1200')
            # print(floor_pic)
            # 2 title
            floor_name = div_list.xpath('.//img/@alt')[0]
            # print(floor_name)
            # 3 price
            floor_price = div_list.xpath('.//span[@class="content__list--item-price"]/em/text()')[0]
            # print(floor_price)
            # 4 tags, joined into a single string
            floor_lable = div_list.xpath(".//p[@class='content__list--item--bottom oneline']/i/text()")
            floor_lable = "/".join(floor_lable)
            # print(floor_lable)
            # 5 publish time
            floor_time = div_list.xpath(".//p[@class='content__list--item--time oneline']/text()")
            floor_time = floor_time[0] if floor_time else ''
            # print(floor_time)
            # 6 location info
            floor_city = div_list.xpath('.//p[@class="content__list--item--des"]//text()')[1]    # district
            floor_local = div_list.xpath('.//p[@class="content__list--item--des"]//text()')[3]   # business circle
            floor_area = div_list.xpath('.//p[@class="content__list--item--des"]//text()')[6].strip()    # floor area
            floor_toward = div_list.xpath('.//p[@class="content__list--item--des"]//text()')[8].strip()  # orientation
            floor_room = div_list.xpath('.//p[@class="content__list--item--des"]//text()')[10].strip()   # layout, e.g. "2室1厅1卫"
            # print(floor_room)
            room = re.findall(r"(\d+)室", floor_room)
            hall = re.findall(r"(\d+)厅", floor_room)
            tolit = re.findall(r"(\d+)卫", floor_room)
            room = room[0] if room else 0
            hall = hall[0] if hall else 0
            tolit = tolit[0] if tolit else 0
            # print(room,hall,tolit)

            # detail-page url
            detail_url = div_list.xpath('.//p[@class="content__list--item--title twoline"]//a[@target="_blank"]/@href')[0]
            # print(detail_url)
            detail_url = "https://bj.lianjia.com" + detail_url
            # print(detail_url)
            fang_dict={
                "floor_pic":floor_pic,"floor_name":floor_name,"floor_price":floor_price,
                "floor_lable":floor_lable,"floor_time":floor_time,"floor_city":floor_city,
                "floor_local":floor_local,"floor_area":floor_area,"floor_toward":floor_toward,
                "room":room,"hall":hall,"tolit":tolit,"detail_url":detail_url
            }
            self.parse_detail_info(fang_dict)
    def parse_detail_info(self,fang_dict):
        # print(fang_dict)
        detail_url=fang_dict['detail_url']
        print(detail_url)
        # fetch the detail page as an lxml object
        html_xml = self.get_html(detail_url)
        # the 8th "fl oneline" item on the detail page holds the floor information
        floor = html_xml.xpath('//ul/li[@class="fl oneline"][8]/text()')
        floor = floor[0] if floor else ''
        # print(floor)
        # the agent's phone number is not in the page itself; it comes from a separate API
        # phone = html_xml.xpath('//p[@class="content__aside__list--bottom oneline phone"]//text()')
        # print(phone)
        ucid_id = self.get_ucid(html_xml)
        # print(ucid_id)
        # extract house_code from the detail url
        house_code = re.findall(r'zufang/(.*?)\.html', detail_url)[0]
        # print(house_code)
        # build the full broker-API url
        agent_url = "https://bj.lianjia.com/zufang/aj/house/brokers?house_codes={}&position=bottom&ucid={}".format(house_code, ucid_id)
        # request the API
        try:
            headers={'User-Agent':UserAgent().random}
            json_data=requests.get(agent_url,headers=headers).json()
            # print(json_data)
            phone = json_data.get("data")[house_code][house_code].get("tp_number")
            # print(phone)
        except Exception as e:
            print(e)
        else:
            # add floor and phone to the dict, then insert into MySQL
            fang_dict['floor'] = floor
            fang_dict['phone'] = phone
            self.insert_mysql(fang_dict)
    def insert_mysql(self,fang_dict):
        # unpack the dict into the SQL statement
        floor_pic=fang_dict['floor_pic']
        floor_name=fang_dict['floor_name']
        floor_price=fang_dict['floor_price']
        floor_lable=fang_dict['floor_lable']
        floor_time=fang_dict['floor_time']
        floor_city=fang_dict['floor_city']
        floor_local=fang_dict['floor_local']
        floor_area=fang_dict['floor_area']
        floor_toward=fang_dict['floor_toward']
        room=fang_dict['room']
        hall=fang_dict['hall']
        tolit=fang_dict['tolit']
        detail_url=fang_dict['detail_url']
        floor=fang_dict['floor']
        phone=fang_dict['phone']

        sql="""
        insert into lianjia (floor_pic, floor_name, floor_price, floor_lable, floor_time, floor_city, 
 floor_local, floor_area, floor_toward, room, hall, tolit, detail_url,floor,phone) values 
("{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}", {}, {}, {}, "{}", "{}", "{}")
        """.format(floor_pic,floor_name,floor_price,floor_lable,floor_time,floor_city,floor_local,floor_area,floor_toward,room,hall,tolit,detail_url,floor,phone)
        # print(sql)
        # try:
        #     self.cur.execute(sql)
        #     self.conn.commit()
        #     print(self.count,sql)
        #     self.count+=1
        # except Exception as e:
        #     print(e)
        #     self.conn.rollback()

    def conn_mysql(self):
        # create the database connection
        self.conn = pymysql.Connect(host='127.0.0.1', user='root', password='admin', database='02180530', charset='utf8')
        # create the cursor
        self.cur = self.conn.cursor()
    def get_ucid(self, html_xml):
        # the agent's ucid is carried in a data-info attribute on the contact widget
        try:
            ucid_id = html_xml.xpath('//div[@class="phone__hover--wrapper"]/span[@class="contact__im im__online"]/@data-info')[0]
            # print(ucid_id)
            self.count_ucid = 1
            return ucid_id
        except Exception as e:
            print(e)
            # give up after three attempts and return an empty ucid
            if self.count_ucid == 3:
                return ''
            else:
                self.count_ucid += 1
                return self.get_ucid(html_xml)
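# A standalone check of the broker-phone API, using one of the example URLs collected at the
# bottom of this post (a sketch only; the house_code and ucid below are illustrative values):
def check_broker_api(house_code="BJ2262932561259143168", ucid="1000000023007453"):
    url = ("https://bj.lianjia.com/zufang/aj/house/brokers"
           "?house_codes={}&position=bottom&ucid={}".format(house_code, ucid))
    headers = {'User-Agent': UserAgent().random}
    json_data = requests.get(url, headers=headers).json()
    # the phone number sits at data -> <house_code> -> <house_code> -> tp_number
    return json_data.get("data", {}).get(house_code, {}).get(house_code, {}).get("tp_number")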
if __name__ == '__main__':
    # the three stages were run one at a time: CoityArea fills Redis with districts,
    # BusinessCicle fills Redis with business circles, and Lian scrapes the listings
    # city = CoityArea()
    # city()
    # bc = BusinessCicle()
    # bc()
    lian = Lian()
    lian()

"""
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2262932561259143168&position=bottom&ucid=1000000023007453

https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2259430864331218944&position=bottom&ucid=1000000020276829

"""