# 爬取链家所有地区的房屋信息 (Crawl housing listings for every Lianjia region)

from selenium import webdriver
import time
from lxml import etree
import requests
def iskong(a):
    """Return *a* unchanged when it is non-empty, otherwise an empty string.

    Used to guard prints/joins against empty xpath results.
    """
    return a if len(a) else ''

# Initial URL: the Beijing new-home landing page, whose city selector lists
# every region Lianjia covers.
url = 'https://bj.fang.lianjia.com/'
heard = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
}
response = requests.get(url, headers=heard).content.decode('utf-8')

# Map of region name -> absolute region URL, filled from the city selector.
diquxinxi = {}
tree = etree.HTML(response)
daqu = tree.xpath('//li/div[@class="city-enum fl"]')
for xiaoqu in daqu:
    names = xiaoqu.xpath('./a/text()')
    links = xiaoqu.xpath('./a/@href')
    # The hrefs are protocol-relative ("//xx.fang.lianjia.com/..."), so
    # prefix the scheme before storing them.
    for name, link in zip(names, links):
        diquxinxi[name] = 'https:' + link
#把地区信息装在字典中

# Walk every region gathered above and scrape its new-home ("xinfang")
# listing pages, then each development's detail page.
for quyu, quyu_url in diquxinxi.items():
    print("开始获取" + quyu + "楼盘的网页")
    response = requests.get(quyu_url, headers=heard).content.decode('utf-8')
    tree = etree.HTML(response)
    # href of the region's new-home section: the nav item whose
    # data-other-action attribute equals 2.
    xinfang = tree.xpath('//li/a[@data-other-action=2]/@href')[0]

    # Page through the listing pages until an empty page is returned.
    n = 1
    while True:
        # FIX: the original built the URL as  j + xinfang + 'pg{}' + str(n),
        # which produced ".../pg{}1"; the page number must follow 'pg'.
        loupan = quyu_url + xinfang + 'pg' + str(n)
        response = requests.get(loupan, headers=heard).content.decode('utf-8')
        print("爬取成功-----正在加载")
        tree = etree.HTML(response)
        fangwu = tree.xpath('//ul[@class="resblock-list-wrapper"]/li[@class="resblock-list post_ulog_exposure_scroll has-results"]')
        if not fangwu:
            print(str(quyu) + '楼盘爬取结束')
            break

        base_url = 'https://bj.fang.lianjia.com'
        detail_links = []  # detail-page hrefs collected from every card

        # FIX: the original indexed fangwu[n-1] (the page number used as a
        # list index), so only one card per page was read and pages with
        # fewer cards than the page number raised IndexError. Process every
        # listing card on the page instead.
        for card in fangwu:
            # Cover picture
            tupian = card.xpath('.//a/img/@src')
            print("图片:", tupian[0] if tupian else '')
            # Detail-page link
            xiangqin = card.xpath('.//div[@class="resblock-name"]/a/@href')
            if xiangqin:
                print("详情页:", base_url + xiangqin[0])
                detail_links.extend(xiangqin)
            # Development name
            mingcheng = card.xpath('.//div[@class="resblock-name"]/a/text()')
            print("楼盘名称:", mingcheng[0] if mingcheng else '')
            # District
            chengqu = card.xpath('.//div[@class="resblock-location"]/span/text()')
            print("城区:", chengqu[0] if chengqu else '')
            # Address
            dizhi = card.xpath('.//div[@class="resblock-location"]/a/text()')
            print("地址:", dizhi[0] if dizhi else '')
            # Main layouts (several spans joined into one string)
            zhuli = card.xpath('.//a[@class="resblock-room"]/span/text()')
            print("主力户型:", ''.join(zhuli))
            # Floor area
            jianmian = card.xpath('.//div[@class="resblock-area"]/span/text()')
            if jianmian:
                print("建筑面积:", iskong(jianmian[0]))
            else:
                print("建筑面积未知")
            # Average price: number + unit
            jiage = card.xpath('.//div[@class="main-price"]/span[@class="number"]/text()')
            danwei = card.xpath('.//div[@class="main-price"]/span[@class="desc"]/text()')
            if jiage and danwei:
                print("均价:", iskong(jiage[0]) + iskong(danwei[0]))
            else:
                print("均价异常")

        # Visit every detail page collected from this listing page.
        for fangzi in detail_links:
            url = base_url + fangzi
            # FIX: the original detail request sent no User-Agent header,
            # unlike every other request in the script.
            response = requests.get(url, headers=heard).content.decode('utf-8')
            tree = etree.HTML(response)
            title = tree.xpath('//h1/text()')
            print("正在爬取", title[0] if title else url, "的界面")

            fangwuxinxi = {}  # everything scraped for this one development

            # ---- development news ("dongtai") -------------------------
            print("开始爬取楼盘动态")
            dongtai = []
            dongtaixiangqin = tree.xpath('.//a[@class="dynamic-more pull-right"]/@href')
            if not dongtaixiangqin:
                dongtai.append("没有任何动态")
                print(dongtai[0])
            else:
                response = requests.get(url + 'dongtai/', headers=heard).content.decode('utf-8')
                # FIX: the original reassigned `tree` here, so the layout /
                # album / detail sections below parsed the dongtai page
                # instead of the detail page.  Use a separate tree.
                dongtai_tree = etree.HTML(response)
                titles = dongtai_tree.xpath('//span[@class="a-title"]/text()')
                contents = dongtai_tree.xpath('//a[@onclick="return false"]/text()')
                dates = dongtai_tree.xpath('//span[@class="a-time"]/text()')
                # FIX: the original reused one dict for every entry, so all
                # items in `dongtai` were references to the last entry.
                for t, c, d in zip(titles, contents, dates):
                    dongtai.append({'title': t, 'content': c, 'date': d})
                fangwuxinxi["动态"] = dongtai

            # ---- layouts ("huxing") -----------------------------------
            print("开始爬取户型")
            huxing = []
            for hx in tree.xpath('//div[@class="houselist"]/ul'):
                pic_list = hx.xpath('.//li/img/@src')
                print("pic:", pic_list[0] if pic_list else '')
                miaoshu_list = hx.xpath('.//li[@class="info-li"]/p[@class="p1"]/text()')
                miaoshu = miaoshu_list[0] if miaoshu_list else ''
                # The p1 text looks like "3室2厅1卫": rooms / halls / baths
                # sit at fixed character positions 0 / 2 / 4.
                shi = miaoshu[0] if len(miaoshu) > 0 else ''
                ting = miaoshu[2] if len(miaoshu) > 2 else ''
                ce = miaoshu[4] if len(miaoshu) > 4 else ''
                print("室:", iskong(shi))
                print("厅:", iskong(ting))
                print("厕所:", iskong(ce))
                area_list = hx.xpath('.//li[@class="info-li"]/p[@class="p1"]/span/text()')
                area = area_list[0] if area_list else ''
                print("面积", iskong(area))
                price = hx.xpath('.//li[@class="info-li"]/p[@class="p2"]/span/text()')
                print("价格", iskong(price))
                date = hx.xpath('.//li[@class="info-li"]/p[@class="p2"]/span[@class="p2-time"]/text()')
                print("时间:", iskong(date))
                type_desc = hx.xpath('.//li[@class="info-li"]/p[@class="p3"]/span/text()')
                print("解读:", iskong(type_desc))
                # FIX: the original wrote  huxing = {...},  — the trailing
                # comma made each entry a 1-tuple, and the assignment
                # overwrote the list instead of appending to it.
                huxing.append({
                    "pic": pic_list,
                    "room": iskong(shi),       # 室
                    "ting": iskong(ting),      # 厅
                    "wei": iskong(ce),         # 卫
                    "area": iskong(area),      # 建筑面积
                    "price": iskong(price),
                    "date": iskong(date),
                    "type_desc": iskong(type_desc),  # 户型解读
                })
            fangwuxinxi["户型"] = huxing

            # ---- photo albums -----------------------------------------
            print("正在爬取图片相册")
            pic = {}
            albums = tree.xpath('.//div[@class="album-list-item pull-left"]')
            if not albums:
                print("效果图不存在")
            else:
                pic["效果图"] = ['https://bd.fang.lianjia.com' + href
                              for href in albums[0].xpath('.//a/@href')]
            # FIX: the original indexed shijin[1] after only checking for an
            # empty list, raising IndexError when exactly one album exists.
            if len(albums) < 2:
                print("实景图不存在")
            else:
                pic["实景图"] = ['https://bd.fang.lianjia.com' + href
                              for href in albums[1].xpath('.//a/@href')]
            print(pic)

            # ---- key facts table --------------------------------------
            print("正在爬取楼盘详情")
            p_list = tree.xpath('.//p[@class="desc-p clear"]/span/text()')
            # The spans alternate key, value, key, value, ...; pair them
            # directly instead of the original two-pass key/value fill.
            lp_dic = {}
            for key, value in zip(p_list[0::2], p_list[1::2]):
                lp_dic[key] = value.replace(" ", "").replace("\n", "")
            print(lp_dic)
            fangwuxinxi["楼盘详情"] = lp_dic

            print("一个房屋的所有信息时:", fangwuxinxi)

        n += 1















 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值