# Scrape rental-listing data from Beike (cd.zu.ke.com).
from csv import writer
from re import fullmatch
import os
import requests
from bs4 import BeautifulSoup
def get_url(start_page=1, end_page=2):
    """Scrape Beike (cd.zu.ke.com) Chengdu rental listings into file/house.csv.

    Pages are fetched with ``range(start_page, end_page)`` semantics, i.e.
    ``end_page`` is exclusive.

    :param start_page: first result page to fetch (inclusive)
    :param end_page: stop page (exclusive)
    :raises requests.HTTPError: if a page request returns an error status
    """
    # Create the output directory 'file' if it does not exist yet.
    if not os.path.exists(r'file'):
        os.mkdir(r'file')
    # Request headers are loop-invariant; build them once, outside the page loop.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    }
    # Context manager guarantees the CSV file is closed even if a request
    # or parse step raises (the original leaked the handle on exceptions).
    with open(r'file/house.csv', 'w', encoding='utf-8', newline='') as f:
        w1 = writer(f)
        # Header row.
        w1.writerow(['名字', '地址', '面积', '样式', '价格'])
        for page in range(start_page, end_page):
            url = rf'https://cd.zu.ke.com/zufang/pg{page}/#contentList'
            # timeout keeps a hung connection from stalling the whole crawl
            response = requests.get(url, headers=headers, timeout=10)
            # Fail fast on HTTP errors instead of silently parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')
            # Each listing card is one div under the article content container.
            div_house = soup.select('.content__article>div>div')
            for x in div_house:
                # Listing title.
                name = x.select_one('.twoline').text.strip()
                # The <p> tags hold address / area / layout, newline-separated;
                # split each into a list of fragments.
                msg = x.select('div>p')
                msg_list = [y.text.strip().split('\n') for y in msg]
                # Listings come in two layouts; the first fragment of the second
                # <p> tells them apart. Skip cards matching neither (e.g. ads).
                if fullmatch(r'\w+\s+/', msg_list[1][0]):
                    add = msg_list[1][1]
                    area = msg_list[1][3].strip()
                    pattern = msg_list[1][5].strip()
                elif fullmatch(r'\w+-\w+-\w+', msg_list[1][0]):
                    add = msg_list[1][0]
                    area = msg_list[1][2].strip()
                    pattern = msg_list[1][4].strip()
                else:
                    continue
                # Monthly price, e.g. "1500 元/月".
                price = x.select_one('.content__list--item-price').text
                w1.writerow([name, add, area, pattern, price])
            print(f'第{page}页加载完成')
if __name__ == '__main__':
    # Crawl result pages 1 through 100 (the end page is exclusive).
    get_url(start_page=1, end_page=101)