Web Scraper: Deep-Crawling Lianjia Rental Listings (Using Redis and MySQL)

"""

1. Import the required modules
import redis  # Redis database connection
import requests  # HTTP requests
from lxml import etree  # XPath parsing
from fake_useragent import UserAgent  # random User-Agent headers
import re  # regular expressions
import pymysql  # MySQL database connection

2. Fetch the parsed HTML document for a given URL (for XPath analysis)
Either requests or selenium works; prefer requests (it is faster)
This function belongs to the class defined in step 3

3. Define a city class (e.g. for cities nationwide)
In it, define a function that fetches city info from a given site URL,
getting each city's name and URL (the URL may need to be joined)
Store the city info in Redis, so later runs can read it straight from Redis instead of re-requesting the original URL (see the Redis sketch below)

4. Define a per-city district class that inherits from the city class
Use the city URL to fetch each district's URL (and name)
The method is the same as for cities

5. Define a dedicated extraction class (it fetches the max page count and parses detail pages)
Connect to the database at the start, not at the end
Loop over the district/business-circle info from step 4
Get the maximum page number (the URL may need to be joined)
Fetch the data for each paginated URL, then narrow the scope for parsing,
e.g. image, title, description, price, district, area, room info, date
Get the detail-page URL, join it, and put the parsed data into a dictionary
Define a separate function for detail-page info (the detail page reached from a listing)
Extract the needed fields the same way as for districts (some fields may live in an API endpoint)
Exception handling is sometimes needed
Merge the detail-page fields into the same dictionary
Insert into MySQL (the values must be unpacked from the dictionary); this may also need exception handling
Create the database connection object
# Note: API endpoints are best parsed in a dedicated function

"""
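
The caching idea in steps 3 and 4 reduces to a small Redis round-trip. A minimal sketch (db=1 and the key name match the code below; the sample URL/name pair is illustrative):

import redis

r = redis.Redis(host="127.0.0.1", port=6379, db=1)
# URLs and names go in as one flat list: url, name, url, name, ...
r.rpush("city_area_list", "/zufang/chaoyang/", "朝阳")
# A later run reads the cached pairs back instead of re-requesting the site
pairs = [item.decode("utf-8") for item in r.lrange("city_area_list", 0, -1)]
print(pairs)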
Full code:

import redis
import requests
from lxml import etree
from fake_useragent import UserAgent
import re
import pymysql


class CityArea:

    def __init__(self):
        # Initialize the Redis connection
        self.r = self.get_redis()

    def __call__(self, *args, **kwargs):
        self.get_city_area()

    # Connect to the Redis database
    def get_redis(self):
        return redis.Redis(host="127.0.0.1", port=6379, db=1)

    def get_city_area(self):
        # Fetch the district info
        base_url = "https://bj.lianjia.com/zufang/"
        html_xml = self.get_html(base_url)

        city_area_list = html_xml.xpath("//ul[@data-target='area']/li[position()>1]/a/@href | "
                       "//ul[@data-target='area']/li[position()>1]/a/text()")
        print(city_area_list)
        print(len(city_area_list))

        for city_area in city_area_list:
            if "zufang" in city_area:
                city_area = "https://bj.lianjia.com" + city_area
            print(city_area)
            # Push the district URL/name pairs into Redis
            self.r.rpush("city_area_list", city_area)

    # Fetch and parse the page at the given URL
    def get_html(self, url):
        headers = {"User-Agent": UserAgent().random}
        response = requests.get(url, headers=headers)
        html = response.text
        # print(html)
        return etree.HTML(html)


class BusinessCircle(CityArea):

    def __call__(self, *args, **kwargs):
        self.get_business_circle()

    # Get the business-circle URLs from each district URL
    def get_business_circle(self):
        count = 1
        # Read the district info back from Redis
        city_area_list = self.r.lrange("city_area_list", 0, -1)
        # print(city_area_list)
        for index in range(0, len(city_area_list), 2):
            # print(index)
            # The list alternates district URL and district name
            city_area_url = city_area_list[index].decode("utf-8")
            city_area_name = city_area_list[index+1].decode("utf-8")
            print(city_area_url, city_area_name)

            # Parse the district page
            html_xml = self.get_html(city_area_url)
            # Extract the business-circle URLs and names
            business_circle_list = html_xml.xpath("//div[@id='filter']/ul[4]/li[position()>1]/a/@href | "
                                                  "//div[@id='filter']/ul[4]/li[position()>1]/a/text()")

            print(business_circle_list)
            for i in range(len(business_circle_list)):
                # Items alternate: url at even indexes, name at odd indexes
                business_circle = business_circle_list[i]
                # Join the district and business-circle names with "-" before storing
                if i % 2 == 1:
                    business_circle = city_area_name + "-" + business_circle_list[i]
                print(count, business_circle, type(business_circle))
                # print(type(business_circle))
                count += 1

                # Store in Redis
                self.r.rpush("business_circle_list", business_circle)

            # break


class Lian(CityArea):

    def __call__(self, *args, **kwargs):
        self.count = 1
        # Connect to MySQL up front, not at the end
        self.conn_mysql()
        self.count_ucid = 1
        self.get_page_url()


    def get_page_url(self):
        # Read the business-circle info back from Redis
        business_circle_list = self.r.lrange("business_circle_list", 0, -1)
        # print(business_circle_list)
        # Loop over the business-circle URLs
        for index in range(0, len(business_circle_list), 2):
            # The list alternates business-circle URL and name
            business_circle_url = business_circle_list[index].decode("utf-8")
            # Build the full business-circle URL
            business_circle_url = "https://bj.lianjia.com" + business_circle_url
            business_circle_name = business_circle_list[index+1].decode("utf-8")
            print("==================={}开始下载====================".format(business_circle_name))
            print(business_circle_url, business_circle_name)
            # Parse the business-circle page
            html_xml = self.get_html(business_circle_url)

            # Get the maximum page number
            max_page = html_xml.xpath("//div[@class='content__pg']/@data-totalpage")
            # If no max page is found, max_page is an empty list; skip this circle
            if not max_page:
                continue
            max_page = int(max_page[0])
            # print(max_page, type(max_page))

            # Generate each paginated URL
            for page in range(1, max_page+1):
                # Build the full page URL
                page_url = business_circle_url + "pg{}/".format(page)
                # print(page_url)
                # Scrape the data
                self.get_data(page_url)
                # break
            # break

    # Scrape the data for one paginated URL
    def get_data(self, page_url):
        # Parse the page
        html_xml = self.get_html(page_url)

        # Narrow the scope to each listing's div
        div_list = html_xml.xpath("//div[@class='content__list']/div")

        for div in div_list:
            # Image: swap the 250x182 thumbnail for a larger rendition
            pic = div.xpath(".//img/@data-src")[0]
            pic = pic.replace("250x182", "2500x1800")
            # print(pic)

            # Title
            title = div.xpath(".//p[@class='content__list--item--title twoline']/a/text()")[0].strip()
            # print(title)

            # District
            city_area = div.xpath(".//p[@class='content__list--item--des']/a[1]/text()")[0]

            # Business circle
            business_circle = div.xpath(".//p[@class='content__list--item--des']/a[2]/text()")[0]
            # print(city_area, business_circle)

            # Area
            area = div.xpath(".//p[@class='content__list--item--des']//text()[4]")
            area = area[0].strip() if area else ""  # handle missing values
            # print(area)

            # Orientation
            toward = div.xpath(".//p[@class='content__list--item--des']//text()[5]")[0].strip()
            # print(toward)

            # Room info, e.g. "2室1厅1卫"
            fang_info = div.xpath(".//p[@class='content__list--item--des']//text()[6]")[0].strip()
            # print(fang_info)
            room = re.findall(r"(\d+)室", fang_info)    # bedrooms
            hall = re.findall(r"(\d+)厅", fang_info)    # living rooms
            toilet = re.findall(r"(\d+)卫", fang_info)  # bathrooms
            # Handle missing values
            room = int(room[0]) if room else 0
            hall = int(hall[0]) if hall else 0
            toilet = int(toilet[0]) if toilet else 0
            # print(room, hall, toilet)

            # Publish date
            publish_date = div.xpath(".//p[@class='content__list--item--time oneline']/text()")[0]
            # print(publish_date)

            # Tags
            sign_list = div.xpath(".//p[@class='content__list--item--bottom oneline']/i/text()")
            # print(sign_list)
            # Join the tags into a single string
            sign = "#".join(sign_list)
            # print(sign)

            # Price
            price = div.xpath(".//em/text()")[0]
            # print(price)

            # Detail URL
            detail_url = div.xpath(".//p[@class='content__list--item--title twoline']/a/@href")[0]
            # Build the full detail URL
            detail_url = "https://bj.lianjia.com" + detail_url
            # print(detail_url)

            fang_dict = {
                "pic": pic, "title": title, "city_area": city_area, "business_circle": business_circle,
                "area": area, "toward": toward, "room": room, "hall": hall, "toilet": toilet,
                "publish_date": publish_date, "sign": sign, "price": price, "detail_url": detail_url
            }

            self.parse_detail(fang_dict)

    # Parse the detail page
    def parse_detail(self, fang_dict):

        # print(fang_dict)
        detail_url = fang_dict['detail_url']
        # print(detail_url)

        # Parse the detail page
        html_xml = self.get_html(detail_url)

        # Floor (the 8th item in the basic-info list)
        floor = html_xml.xpath("//ul/li[@class='fl oneline'][8]/text()")
        floor = floor[0] if floor else ""
        # print(floor)

        # The agent's phone number is not in the page itself;
        # it comes from a separate API endpoint
        # phone = html_xml.xpath(".//p[@class='content__aside__list--bottom oneline phone']/text()")
        # print(phone)

        # Get the agent id (ucid)
        ucid = self.get_ucid(html_xml)
        # print(ucid)
        # Extract the house_code from the detail URL
        house_code = re.findall(r"zufang/(.*?)\.html", detail_url)[0]
        # print(house_code)

        # Build the full agent-API URL
        agent_url = "https://bj.lianjia.com/zufang/aj/house/brokers?" \
                    "house_codes={}&position=bottom" \
                    "&ucid={}".format(house_code, ucid)
        # print(agent_url)
        try:
            # Fetch the phone number from the API
            headers = {"User-Agent": UserAgent().random}
            json_data = requests.get(agent_url, headers=headers).json()
            # print(json_data)
            phone = json_data.get("data")[house_code][house_code].get("tp_number")

            # print(phone)
        except Exception as e:
            print(e)
            phone = ''

        # Add the phone and floor info to fang_dict
        fang_dict["floor"] = floor
        fang_dict["phone"] = phone

        self.insert_mysql(fang_dict)

    def insert_mysql(self, fang_dict):

        # Unpack the fields for the SQL statement
        pic = fang_dict["pic"]
        title = fang_dict["title"]
        city_area = fang_dict["city_area"]
        business_circle = fang_dict["business_circle"]
        area = fang_dict["area"]
        toward = fang_dict["toward"]
        room = fang_dict["room"]
        hall = fang_dict["hall"]
        toilet = fang_dict["toilet"]
        publish_date = fang_dict["publish_date"]
        sign = fang_dict["sign"]
        price = fang_dict["price"]
        detail_url = fang_dict["detail_url"]
        floor = fang_dict["floor"]
        phone = fang_dict["phone"]

        # Note: string-formatted SQL breaks on embedded quotes and invites SQL
        # injection; see the parameterized sketch after this listing
        sql = '''
        insert into lianjia (pic, title, city_area, business_circle, area, toward, room,
        hall, toilet, publish_date, sign, price, detail_url, floor, phone) values
        ("{}", "{}", "{}", "{}", "{}", "{}",
        {}, {}, {}, "{}", "{}", "{}", "{}",
        "{}", "{}")
        '''.format(pic, title, city_area, business_circle, area, toward, room, hall,
                   toilet, publish_date, sign, price, detail_url, floor, phone)

        try:
            # Execute the SQL statement and commit
            self.cur.execute(sql)
            self.conn.commit()
            print(self.count, sql)
            self.count += 1
        except Exception as e:
            print(e)
            # Roll back on failure: a statement either fully succeeds or fully fails
            self.conn.rollback()

    def conn_mysql(self):
        # Create the database connection object
        # (note the charset is "utf8", not "utf-8")
        self.conn = pymysql.connect(host="127.0.0.1", user="root", password='123',
                                    database="刘争", charset="utf8")
        # Create the cursor for executing statements
        self.cur = self.conn.cursor()

    # The ucid is sometimes missing from the page, so retry up to three times
    def get_ucid(self, html_xml):

        try:
            ucid = html_xml.xpath("//span[@class='contact__im im__online']/@data-info")[0]
            # print(ucid)
            self.count_ucid = 1
            return ucid
        except Exception as e:
            print(e)
            if self.count_ucid == 3:
                return ""
            else:
                self.count_ucid += 1
                return self.get_ucid(html_xml)


# On a successful retry, the recursion unwinds and the ucid is returned up the call chain


if __name__ == '__main__':
    # Run the stages in order; calling an instance triggers __call__
    # cityarea = CityArea()
    # cityarea()
    # bc = BusinessCircle()
    # bc()
    lian = Lian()
    lian()
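
A caveat on insert_mysql above: building SQL with str.format breaks as soon as a scraped title contains a quote character, and it leaves the table open to SQL injection. A minimal parameterized variant, as a sketch of a drop-in method for Lian (the name insert_mysql_safe is ours; table and columns are unchanged):

def insert_mysql_safe(self, fang_dict):
    # %s placeholders let pymysql escape every value itself
    keys = ["pic", "title", "city_area", "business_circle", "area", "toward",
            "room", "hall", "toilet", "publish_date", "sign", "price",
            "detail_url", "floor", "phone"]
    sql = "insert into lianjia ({}) values ({})".format(
        ", ".join(keys), ", ".join(["%s"] * len(keys)))
    try:
        self.cur.execute(sql, [fang_dict[k] for k in keys])
        self.conn.commit()
    except Exception as e:
        print(e)
        # Roll back the failed statement
        self.conn.rollback()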

'''
Phone API analysis:
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2259333770690183168&position=bottom&ucid=1000000026012783
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2234691835526389760&position=bottom&ucid=1000000023002201
'''
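
The endpoint can also be probed on its own before wiring it into parse_detail. A minimal sketch using the first example URL above (the nesting mirrors what parse_detail reads; tp_number is the observed field name):

import requests
from fake_useragent import UserAgent

house_code = "BJ2259333770690183168"
ucid = "1000000026012783"
url = ("https://bj.lianjia.com/zufang/aj/house/brokers?"
       "house_codes={}&position=bottom&ucid={}".format(house_code, ucid))
resp = requests.get(url, headers={"User-Agent": UserAgent().random})
# The number is nested as data -> house_code -> house_code -> tp_number
phone = (resp.json().get("data", {})
         .get(house_code, {})
         .get(house_code, {})
         .get("tp_number"))
print(phone)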
