安居客_义乌二手房房源爬取

冷雪言

于 2024-08-18 14:22:43 发布

阅读量282

点赞数 11

分类专栏： python 爬虫 selenium 文章标签： python 开发语言

本文链接：https://blog.csdn.net/m0_72057247/article/details/141299443

版权

python 同时被 3 个专栏收录

2 篇文章 0 订阅

订阅专栏

爬虫

1 篇文章 0 订阅

订阅专栏

selenium

1 篇文章 0 订阅

订阅专栏

爬取安居客上义乌二手房房源的信息

解释：

因为我的 Chrome 浏览器驱动（chromedriver）直接放在 Python 的主目录下，所以在创建 webdriver 对象时没有显式指定驱动的路径。

具体代码如下：

from selenium import webdriver
import time
from selenium.webdriver.common.by import By
import pandas as pd

# Start a Chrome session (no driver path is passed) and maximize the window.
wd = webdriver.Chrome()
wd.maximize_window()

# Empty result table; its columns define the layout of the exported data.
houses = pd.DataFrame(
    columns=[
        '所属小区',  # residential community
        '所属区域',  # district / area
        '户型',  # layout
        '建面',  # floor area
        '朝向',  # orientation
        '楼层',  # floor
        '装修',  # decoration
        '年份',  # year
        '电梯相关',  # elevator tag
        '地铁相关',  # subway tag
        '单价',  # unit price
        '价格',  # total price
    ]
)

# Crawl listing pages 1-32 and scrape every house linked from each page.
#
# For each listing page all detail-page URLs are read up front and then
# visited one by one. Holding plain URL strings (instead of WebElements)
# means nothing goes stale after navigating away, so there is no need to
# go back to the listing and re-query it after every house.
#
# Scraped rows are buffered in a list and added to `houses` once at the end,
# using only public pandas APIs (the private `DataFrame._append` is avoided).
scraped_rows = []
for page_no in range(1, 33):
    print(f"==========================正在爬取第{page_no}页================================")
    wd.get(f"https://yiwu.anjuke.com/sale/p{page_no}/?from=HomePage_TopBar")
    time.sleep(2)  # fixed pause after navigation before querying the page

    # Collect the detail-page link of every listing card on this page.
    listing = wd.find_element(By.CSS_SELECTOR, "section.list-main > section.list-left > section.list")
    cards = listing.find_elements(By.CLASS_NAME, "property")
    detail_urls = [card.find_element(By.CSS_SELECTOR, "a").get_attribute("href") for card in cards]

    for url in detail_urls:
        print(url)

        # Open the detail page of this house.
        wd.get(url)
        time.sleep(2)

        # Community name (first item) and district (second item).
        community_items = wd.find_element(By.CLASS_NAME, "maininfo-community").find_elements(
            By.CLASS_NAME, "maininfo-community-item")
        house_community = community_items[0].find_elements(By.CSS_SELECTOR, "a")[0].text
        house_area = community_items[1].find_element(By.CLASS_NAME, "maininfo-community-item-name").text

        # The three "model" boxes each hold a strong (headline) value and a
        # weak (secondary) value. They are matched on two classes at once, so
        # they are written as explicit CSS selectors.
        model_1 = wd.find_element(By.CSS_SELECTOR, ".maininfo-model-item.maininfo-model-item-1")
        model_2 = wd.find_element(By.CSS_SELECTOR, ".maininfo-model-item.maininfo-model-item-2")
        model_3 = wd.find_element(By.CSS_SELECTOR, ".maininfo-model-item.maininfo-model-item-3")

        house_type = model_1.find_element(By.CLASS_NAME, "maininfo-model-strong").text  # layout
        house_floor = model_1.find_element(By.CLASS_NAME, "maininfo-model-weak").text  # floor
        house_acreage = model_2.find_element(By.CLASS_NAME, "maininfo-model-strong").text  # floor area
        house_fitment = model_2.find_element(By.CLASS_NAME, "maininfo-model-weak").text  # decoration
        house_direction = model_3.find_element(By.CLASS_NAME, "maininfo-model-strong-text").text  # orientation
        house_year = model_3.find_element(By.CLASS_NAME, "maininfo-model-weak").text  # year

        # Tags: remember the elevator / subway tag text when present,
        # otherwise keep the default 0.
        elevator_correlation = 0
        subway_correlation = 0
        tags = wd.find_element(By.CLASS_NAME, "maininfo-tags").find_elements(By.CLASS_NAME, "maininfo-tags-item")
        for tag in tags:
            tag_text = tag.text  # read once; each .text access is a driver call
            if tag_text == "有电梯":
                elevator_correlation = tag_text
            elif tag_text == "近地铁":
                subway_correlation = tag_text

        print(elevator_correlation)
        print(subway_correlation)

        # Unit price
        unit_price = wd.find_element(By.CLASS_NAME, "maininfo-avgprice-price").text
        print(unit_price)

        # Total price
        total_price = wd.find_element(By.CLASS_NAME, "maininfo-price-wrap").find_element(
            By.CLASS_NAME, "maininfo-price-num").text
        print(total_price)

        # One row per house, keyed by the column names of `houses`.
        all_house_data = {
            "所属小区": house_community,
            "所属区域": house_area,
            "户型": house_type,
            "建面": house_acreage,
            "朝向": house_direction,
            "楼层": house_floor,
            "装修": house_fitment,
            "年份": house_year,
            "电梯相关": elevator_correlation,
            "地铁相关": subway_correlation,
            "单价": unit_price,
            "价格": total_price,
        }
        scraped_rows.append(all_house_data)
        print(all_house_data)

# Add every scraped row to the result table in one step, keeping any rows
# already in `houses` and its column order.
houses = pd.DataFrame(houses.to_dict("records") + scraped_rows, columns=houses.columns)

# Write the collected data to an Excel file, then end the browser session.
excel_file_path = "old_house_data.xlsx"  # change this path to write elsewhere
try:
    houses.to_excel(excel_file_path, index=False)
finally:
    # Always end the WebDriver session, even if the export fails, so the
    # browser and its driver process are not left running.
    wd.quit()