【Python 爬虫小项目 01】租房数据

思路描述:
整体观测:某网后台数据是自动从数据库里抓取的;比如我选中【深圳】-【南山区】-【竹子林】,共有250套房源,每页呈现30套房源数据,这30套在每一次点击进去数据都会略有不同。
url方面:采用静态编号pg[ ]代表页数,zufang/后面携带商圈&行政区的拼音编码;所以首先根据所爬内容创建一个商圈list;之所以没用行政区维度去采集是因为商圈的范围小,为了保障数据的完整性,在小商圈中采集效果更好。
采集内容方面: 值得注意的是我们在整租房源下进行采集,整租房源编号在url上保持不变就可以;为了保障数据采集且能够有效去重,在图片命名上能够采集到一个类似房源编号的东西:SZXXXXXXXXX;这个数据能够帮我们做唯一性校验;
基本方法:requests + BeautifulSoup

引入使用的包

import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

设置一个商圈List

# Business-area slugs to crawl, one group per line as in the source site's
# district listing.  Each slug was verified by hand because some carry a
# trailing digit to disambiguate areas that share a name nationwide
# (e.g. 'xinzhou1', 'xili1', 'daxuecheng3').
business_area = [
    'buxin', 'baishida', 'cuizhu', 'chunfenglu', 'dongmen', 'diwang',
    'honghu', 'huangmugang', 'liantang', 'luohukouan', 'luoling',
    'qingshuihe', 'sungang', 'wanxiangcheng', 'xinxiu',
    'yinhu', 'bagualing', 'baihua', 'chegongmiao', 'chiwei',
    'futianbaoshuiqu', 'futianzhongxin', 'huanggang', 'huangmugang',
    'huaqiangbei', 'huaqiangnan', 'jingtian', 'lianhua', 'meilin',
    'shixia', 'shangxiasha',
    'shawei', 'shangbu', 'xiangmihu', 'xiangmeibei', 'xinzhou1',
    'yuanling', 'yinhu', 'zhuzilin',
    'baishizhou', 'daxuecheng3', 'hongshuwan', 'houhai', 'huaqiaocheng1',
    'kejiyuan', 'nanshanzhongxin', 'nantou', 'qianhai', 'shekou',
    'shenzhenwan', 'xili1',
    'baoanzhongxin', 'bihai1', 'fanshen', 'fuyong', 'songgang', 'shajing',
    'shiyan', 'taoyuanju', 'xinan', 'xicheng1', 'xixiang',
    'bantian', 'guanlan', 'hongshan6', 'longhuazhongxin', 'longhuaxinqu',
    'minzhi', 'meilinguan', 'shangtang', 'shiyan',
    'bujiguan', 'bujidafen', 'bujishuijing', 'bujishiyaling', 'bujijie',
    'bujinanling', 'danzhutou', 'dayunxincheng', 'henggang',
    'longgangshuanglong', 'longgangzhongxincheng', 'longgangbaohe',
    'pingdi', 'pinghu',
]
# Total number of business areas that will be crawled.
len(business_area)

采集数据封包

# Parsing scheme that needs no post-cleaning: everything is stripped in place.
def house_info_2(single_soup):
    """Extract rental-listing fields from one parsed result page.

    Parameters
    ----------
    single_soup : bs4.BeautifulSoup
        Parsed HTML of a single listing page.

    Returns
    -------
    list[list]
        One 16-element row per listing:
        [title, type, layout, district, business area, estate name, area (sqm),
         orientation, bedrooms, living rooms, restrooms, floor, labels, brand,
         price, listing id].
        Listings whose markup lacks an expected node are silently skipped.
    """
    house_info = []
    # Every listing on the page lives in one of these container <div>s.
    total_div = single_soup.find_all('div', class_="content__list--item--main")
    for house in total_div:
        try:
            title_link = house.find('a', class_='twoline')
            # Listing id ('SZxxxxxxxxx'-like) from the href — used later as
            # the uniqueness key for de-duplication.
            house_url = title_link.get('href').replace('/zufang/', '').replace('.html', '')
            # Title text is whitespace-separated; the useful fields sit at
            # fixed offsets — parse once instead of re-splitting per field.
            title_parts = title_link.text.split(" ")
            house_title = title_parts[10].replace('整租·', '')
            house_type = title_parts[11]
            # Layout style (duplex, loft, ...).
            house_aspect = title_parts[12]
            # District / business area / estate name from the breadcrumb links.
            blank_links = house.find_all('a', target="_blank")
            house_district = blank_links[1].text
            house_business_area = blank_links[2].text
            resblock_name = blank_links[3].text
            # Description line: '.../ area / orientation / rooms / floor'.
            # Strip spaces and newlines once for every slash-separated part.
            des_parts = [part.replace(" ", "").replace('\n', '')
                         for part in house.find('p', class_="content__list--item--des").text.split("/")]
            house_area = des_parts[1].replace('㎡', '')
            # Orientation (N/S/E/W).
            house_aspect_2 = des_parts[2]
            # '3室2厅2卫' -> bedrooms / living rooms / restrooms.
            rooms = des_parts[3]
            house_bedroom_nums = rooms.split('室')[0]
            house_living_room_nums = rooms.split('室')[1].split('厅')[0]
            house_restroom_nums = rooms.split('室')[1].split('厅')[1].replace('卫', '')
            house_floor = des_parts[4]
            # Marketing labels.
            house_label = house.find('p', class_="content__list--item--bottom oneline").text.replace('\n', ' ')
            # Whether the listing is from a branded operator.
            house_brand = house.find('p', class_="content__list--item--brand oneline").text.replace('\n', ' ')
            # Monthly price without the unit suffix.
            house_price = house.find('span', class_="content__list--item-price").text.replace(' 元/月', '')
            house_info.append([house_title, house_type, house_aspect, house_district,
                               house_business_area, resblock_name, house_area, house_aspect_2,
                               house_bedroom_nums, house_living_room_nums, house_restroom_nums,
                               house_floor, house_label, house_brand, house_price, house_url])
        # Skip listings with unexpected markup rather than aborting the page.
        # (Original listed AttributeError four times; once is enough.)
        except (IndexError, AttributeError):
            pass
    return house_info

获取数据

# --- Crawl loop -------------------------------------------------------------
# count: total pages fetched across all areas (global safety cap below).
count = 0
# house_all_info: accumulates the per-page row lists from house_info_2.
house_all_info = []
# Crawl each business area in turn.
for single_business_area in business_area:
    i = 1
    # bug_times: number of extra random re-fetches of this area's pages.
    # The site serves a slightly different subset of listings on each
    # request, so re-sampling already-seen pages improves coverage
    # (capped at 80 extra fetches per area).
    bug_times = 1
    # Safe defaults so the checks below never hit an unbound name when the
    # very first page of an area fails to parse (bug in the original:
    # pages/total_num/chinese_sba were only assigned inside the try).
    pages = 1
    total_num = 0
    chinese_sba = ''
    single_url = 'https://sz.zu.ke.com/zufang/{}/pg{}rt200600000001/#contentList'.format(single_business_area, i)
    while count <= 8000:
        time.sleep(0.8)  # be polite to the server
        print('开始收集', single_url)
        # timeout keeps the crawl from hanging forever on a dead connection.
        single_response = requests.get(single_url, timeout=30)
        single_soup = bs(single_response.text, 'lxml')
        house_all_info.append(house_info_2(single_soup))

        try:
            # Page count reported by the pagination widget (for the walk below).
            pages = int(single_soup.find('div', class_='content__pg').get("data-totalpage"))
            # Total listing count advertised by the site (coverage target).
            total_num = int(single_soup.find('span', class_='content__title--hl').text)
            # Chinese name of the business area, parsed from the page title;
            # used to count how many distinct rows we already hold for it.
            chinese_sba = single_soup.find('p', class_='content__title').text.replace(' ', '').split('\n')[2].replace('深圳', '').replace('租房', '')
        # Malformed page: keep the previous pages/total_num/chinese_sba.
        # (Original also caught KeyboardInterrupt, which swallowed Ctrl-C —
        # removed so the crawl can be interrupted.)
        except (IndexError, AttributeError):
            pass
        i += 1

        # Flatten everything collected so far and de-duplicate to measure
        # how much of this area we have covered.
        final_house_info = [row for page_rows in house_all_info for row in page_rows]
        test_df = pd.DataFrame(final_house_info).drop_duplicates()
        print(chinese_sba)
        print(len(test_df[test_df[4] == chinese_sba]))
        # First walk the pages in order...
        if i <= pages:
            single_url = 'https://sz.zu.ke.com/zufang/{}/pg{}rt200600000001/#contentList'.format(single_business_area, i)
        # ...then, while coverage is short of the advertised total, re-sample
        # random pages of the same area (requires `random`, which the
        # original never imported — fixed in the import block).
        elif len(test_df[test_df[4] == chinese_sba]) < total_num and bug_times <= 80:
            single_url = 'https://sz.zu.ke.com/zufang/{}/pg{}rt200600000001/#contentList'.format(single_business_area, random.randint(1, pages))
            bug_times += 1
        else:
            break
        count += 1
        print('收集第{}页完毕'.format(count))

检验储存数据

检查

# Sanity check: show the first parsed row of the first collected page.
first_page_rows = house_all_info[0]
print(first_page_rows[0])

列表清洗

# Flatten the list of per-page row lists into a single flat list of rows.
final_house_info = [row for page_rows in house_all_info for row in page_rows]
print(final_house_info[0])

储存

# De-duplicate the flattened rows and persist the final dataset to Excel.
df = pd.DataFrame(final_house_info).drop_duplicates()
print(df.head(5))
df.to_excel('贝壳XXXX_full.xlsx')

快去试试吧~

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Eason DayDayUp

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值