Scraping Lianjia shared-rental housing data

A quick walkthrough of scraping Lianjia rental listings with requests and XPath, then saving the results to CSV files.
The code is as follows:

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import csv

class LianjiaSpider(object):
    def __init__(self, num):
        # Request headers
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
        # Base URL of the rental listing pages
        self.base_url = 'https://hz.lianjia.com/zufang/pg'
        # Number of listing pages to crawl
        self.num = num


    def get_list(self):
        # Collected links to the detail pages
        detail_href_list = []

        for page in range(1, self.num + 1):
            # 'rt200600000002' is the filter code for shared-rental (合租) listings
            url = self.base_url + str(page) + 'rt200600000002'
            response = requests.get(url=url, headers=self.headers)
            html_doc = etree.HTML(response.text)

            # Extract the link of every listing on this page
            detail_href = html_doc.xpath("//p[@class='content__list--item--title twoline']/a/@href")
            for href in detail_href:
                detail_href_list.append(href)

        # De-duplicate while preserving order
        detail_href_list = list(dict.fromkeys(detail_href_list))
        return detail_href_list
    
    # Fetch the HTML of a detail page
    def get_detail(self, phref):
        response = requests.get(url=phref, headers=self.headers).text
        return response

    # Extract the listing information from a detail page
    def detail_message(self, response):
        html_doc = etree.HTML(response)
        # Listing title
        try:
            room_name = html_doc.xpath("//div[@class='content clear w1150']/p/text()")[0]
            print(room_name)
        except:
            room_name = None
        # Date the listing was published
        try:
            room_shelf_time = html_doc.xpath("//div[@class='content__subtitle']/text()")[1].strip()
            print(room_shelf_time)
        except:
            room_shelf_time = None
        # Listing code
        try:
            house_code = html_doc.xpath("//div[@class='content__subtitle']/i[@class='house_code']/text()")[0]
            print(house_code)
        except:
            house_code = None
        # Monthly rent
        try:
            price_num = html_doc.xpath("//div[@class='content__aside fr']/p[@class='content__aside--title']/span/text()")[0] + '元/月'
            print(price_num)
        except:
            price_num = None
        # Listing tags
        try:
            room_list = html_doc.xpath("//div[@class='content__aside fr']/p[@class='content__aside--tags']/i/text()")
            print(room_list)
        except:
            room_list = None
        # Rental type (whole / shared)
        try:
            rental_way = html_doc.xpath("//ul[@class='content__aside__list']/p/span/text()")[0]
            print(rental_way)
        except:
            rental_way = None
        # Floor plan
        try:
            room_style = html_doc.xpath("//ul[@class='content__aside__list']/p/span/text()")[1]
            print(room_style)
        except:
            room_style = None
        # Floor area
        try:
            area = html_doc.xpath("//ul[@class='content__aside__list']/p/span/text()")[2]
            print(area)
        except:
            area = None
        # Orientation
        try:
            toward = html_doc.xpath("//ul[@class='content__aside__list']/p/span/text()")[3]
            print(toward)
        except:
            toward = None
        # Housekeeper
        try:
            house_manage = html_doc.xpath("//div[@class='content__aside__list--title oneline']/span/@title")[0]
            print(house_manage)
        except:
            house_manage = None
        # Housekeeper's phone number
        try:
            housemanage_phone = html_doc.xpath("//p[@class='content__aside__list--bottom oneline']/text()")[0]
            print(housemanage_phone)
        except:
            housemanage_phone = None
        # Basic facts about the unit
        try:
            room_message = html_doc.xpath("//div[@class='content__article__info']/ul/li/text()")
            print(room_message)
        except:
            room_message = None
        # Listing description
        try:
            room_describ = html_doc.xpath("//div[@class='content__article__info3']//p[@data-el='houseComment']/text()")
            print(room_describ)
        except:
            room_describ = None

        # Collect the detail-page fields into a dict
        dict_new = {'room_name': room_name, 'room_shelf_time': room_shelf_time, 'house_code': house_code,
                    'price_num': price_num, 'room_list': room_list, 'rental_way': rental_way,
                    'room_style': room_style, 'area': area, 'toward': toward, 'house_manage': house_manage,
                    'housemanage_phone': housemanage_phone, 'room_message': room_message, 'room_describ': room_describ}

        return dict_new

    # Extract the listed amenities from a detail page
    def supporting_facilities(self, response):
        html_doc = etree.HTML(response)
        # Listing title
        try:
            room_name = html_doc.xpath("//div[@class='content clear w1150']/p/text()")[0]
            print(room_name)
        except:
            room_name = None
        # Monthly rent
        try:
            price_num = html_doc.xpath("//div[@class='content__aside fr']/p[@class='content__aside--title']/span/text()")[0] + '元/月'
            print(price_num)
        except:
            price_num = None
        # TV
        try:
            tv = html_doc.xpath("//ul[@class='content__article__info2']/li[@class='fl oneline television ']/text()")[0]
            print(tv)
        except:
            tv = None
        # Refrigerator
        try:
            refrigerator = html_doc.xpath(
                "//ul[@class='content__article__info2']/li[@class='fl oneline refrigerator ']/text()")[0]
            print(refrigerator)
        except:
            refrigerator = None
        # Washing machine
        try:
            washing_machine = html_doc.xpath(
                "//ul[@class='content__article__info2']/li[@class='fl oneline washing_machine ']/text()")[0]
            print(washing_machine)
        except:
            washing_machine = None
        # Air conditioner
        try:
            air_conditioner = html_doc.xpath(
                "//ul[@class='content__article__info2']/li[@class='fl oneline air_conditioner ']/text()")[0]
            print(air_conditioner)
        except:
            air_conditioner = None
        # Water heater
        try:
            water_heater = html_doc.xpath(
                "//ul[@class='content__article__info2']/li[@class='fl oneline water_heater ']/text()")[0]
            print(water_heater)
        except:
            water_heater = None
        # Bed
        try:
            bed = html_doc.xpath(
                "//ul[@class='content__article__info2']/li[@class='fl oneline bed ']/text()")[0]
            print(bed)
        except:
            bed = None
        # Heating
        try:
            heating = html_doc.xpath(
                "//ul[@class='content__article__info2']/li[@class='fl oneline heating ']/text()")[0]
            print(heating)
        except:
            heating = None
        # Broadband / Wi-Fi
        try:
            wifi = html_doc.xpath(
                "//ul[@class='content__article__info2']/li[@class='fl oneline wifi ']/text()")[0]
            print(wifi)
        except:
            wifi = None
        # Wardrobe
        try:
            wardrobe = html_doc.xpath(
                "//ul[@class='content__article__info2']/li[@class='fl oneline wardrobe ']/text()")[0]
            print(wardrobe)
        except:
            wardrobe = None
        # Natural gas
        try:
            natural_gas = html_doc.xpath(
                "//ul[@class='content__article__info2']/li[@class='fl oneline natural_gas ']/text()")[0]
            print(natural_gas)
        except:
            natural_gas = None

        # Collect the amenity fields into a dict
        dict_supporting_facilities = {'room_name': room_name, 'price_num': price_num, 'tv': tv, 'refrigerator': refrigerator,
                                      'washing_machine': washing_machine, 'air_conditioner': air_conditioner,
                                      'water_heater': water_heater, 'bed': bed, 'heating': heating, 'wifi': wifi,
                                      'wardrobe': wardrobe, 'natural_gas': natural_gas}

        return dict_supporting_facilities

    # Append one row to each CSV file
    def save_csv(self, csv_writer, dict_new, csv_sfwriter, dict_supporting_facilities):
        csv_writer.writerow(dict_new.values())
        csv_sfwriter.writerow(dict_supporting_facilities.values())

    def start_work(self):
        detail_href_list = self.get_list()

        # Open both CSV files once; newline='' avoids blank rows on Windows
        csv_file = open('链家基本信息.csv', 'a+', newline='', encoding='utf-8')
        csv_supporting_facilities = open('链家配套设施.csv', 'a+', newline='', encoding='utf-8')

        csv_writer = csv.writer(csv_file)
        csv_sfwriter = csv.writer(csv_supporting_facilities)

        wrote_header = False
        for href in detail_href_list:
            # Full URL of the detail page
            phref = 'https://hz.lianjia.com' + href

            response = self.get_detail(phref=phref)
            dict_new = self.detail_message(response=response)
            dict_supporting_facilities = self.supporting_facilities(response=response)

            # Write the header row once, before the first data row
            if not wrote_header:
                csv_writer.writerow(dict_new.keys())
                csv_sfwriter.writerow(dict_supporting_facilities.keys())
                wrote_header = True

            self.save_csv(dict_new=dict_new, dict_supporting_facilities=dict_supporting_facilities,
                          csv_sfwriter=csv_sfwriter, csv_writer=csv_writer)

        csv_file.close()
        csv_supporting_facilities.close()

if __name__ == '__main__':
    num = int(input('Number of pages to crawl: '))
    ds = LianjiaSpider(num)
    ds.start_work()
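
Since detail_message() and supporting_facilities() already return dicts, the CSV output could also be written with csv.DictWriter, which maps each value to its column by key rather than relying on the order of dict.values(). Below is a minimal sketch of such a helper; the function name save_dicts_to_csv, the rows argument, and the example filename are illustrative and not part of the original script.

import csv
import os

# Illustrative helper (not from the original script): append a list of dicts to a CSV file,
# writing the header row only when the file is new or empty.
def save_dicts_to_csv(rows, filename):
    if not rows:
        return
    fieldnames = list(rows[0].keys())
    need_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
    with open(filename, 'a+', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if need_header:
            writer.writeheader()
        writer.writerows(rows)

# Usage sketch: collect the dicts returned by detail_message() and write them in one call
# save_dicts_to_csv([dict_new], 'example_basic_info.csv')

Writing by key this way keeps the header and the data columns in sync even if the field order in the dict changes, and it avoids duplicate headers when the script is re-run in append mode.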



