python获取贝壳房源信息

使用Python脚本获取贝壳房源信息，可以方便地对比房价、快速筛选出符合需求的房子，也便于了解最新的房价行情，掌握一手房源信息。话不多说，直接上干货！

1.导入所需库

import re
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd

2.打开谷歌浏览器到指定页面

# Start Chrome through the chromedriver binary at this local path.
chrome_service = Service(r"D:\application\Chrome\chromedriver-win64\chromedriver.exe")
driver = webdriver.Chrome(service=chrome_service)
# Load the start URL, maximise the window, then pause 5 s before going on.
driver.get('https://m.ke.com/jx/bangdan/hainingshi1/ibd3?source=ershou/liebiao/bangdan')
driver.maximize_window()
time.sleep(5)
# Empty frame that the scraped rows are concatenated onto.
df = pd.DataFrame()

3.使用driver方法模拟人为点击到我们需要的页面

for i in range(1, 10, 2):
    print(i)
    # Click the image link inside div[i] of the list container.
    driver.find_element(By.XPATH,f'//*[@id="root"]/div/div[2]/div[2]/div[2]/div/div/div[1]/div/div[{i}]/div/div[1]/a/img').click()
    time.sleep(3)
    # Scroll 30% of the way down the document, then switch to the last
    # window handle known to the driver.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.3);")
    time.sleep(3)
    driver.switch_to.window(driver.window_handles[-1])
    # Open the full details view by clicking the element whose text contains
    # the label below.
    text_to_find = "查看全部信息"  # label text to search for on the page
    time.sleep(2)
    xpath_expression = f"//*[contains(text(), '{text_to_find}')]"
    element = driver.find_element(By.XPATH, xpath_expression)
    time.sleep(3)
    element.click()
    time.sleep(5)
    # Community name. Tokens are re-joined with single spaces and every run of
    # characters that is neither a word character nor whitespace becomes '-'.
    name = driver.find_element(By.XPATH, '//*[@id="root"]/div/div[1]/div[2]/div[1]/span[1]').text.split()
    print(name)
    name_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(name))
    time.sleep(1)
    # Community address, cleaned the same way.
    address = driver.find_element(By.CLASS_NAME, 'map-address').text.split()
    print(address)
    address_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(address))
    time.sleep(1)
    # NOTE(review): despite its name, this selects the row labelled
    # "建筑类型"; the value is only printed here. It still raises if the row
    # is missing, so it is kept as-is.
    property_fee1 = driver.find_element(By.XPATH, "//*[@class='item']//span[text()='建筑类型']/..").text.split()
    print(property_fee1)

4.使用XPath获取页面文本元素，并用正则表达式清洗文本

    # Property fee. Several absolute (container div, <p> index) positions are
    # probed in this fixed order and the first one that resolves is used --
    # presumably because the row moves between page layouts (TODO confirm).
    fee_positions = ((8, 15), (7, 15), (8, 14), (8, 13), (6, 13), (7, 14), (7, 13))
    for panel, para in fee_positions:
        try:
            property_fee = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[{para}]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if (panel, para) == fee_positions[-1]:
                raise  # no candidate matched: propagate the lookup error
    print(property_fee)
    # Only the first position is label-checked; text found at a fallback
    # position is accepted as-is. A failed check yields a single space.
    if (panel, para) == fee_positions[0] and "物业费" not in property_fee:
        property_feestring = ' '
    else:
        # Single-space the tokens; runs of non-word, non-space chars become '-'.
        property_feestring = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(property_fee))
    time.sleep(1)
    # Property management company (p[6] of the details container). The
    # container is probed at div[8], div[7] and div[6] in that order; the first
    # one that resolves wins.
    for panel in (8, 7, 6):
        try:
            property_company = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[6]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(property_company)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    property_company_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(property_company))
    time.sleep(1)
    # Developer (p[4] of the details container). The container is probed at
    # div[8], div[7] and div[6] in that order; the first one that resolves wins.
    for panel in (8, 7, 6):
        try:
            developers = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[4]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(developers)
    # Build the string from `developers` itself (joining the property-company
    # tokens here would duplicate that column). Tokens are single-spaced and
    # runs of non-word, non-space chars become '-'.
    developers_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(developers))
    time.sleep(1)
    # House usage (p[2] of the details container). The container is probed at
    # div[8], div[7] and div[6] in that order; the first one that resolves wins.
    for panel in (8, 7, 6):
        try:
            house = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[2]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(house)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    house_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(house))
    time.sleep(1)
    # Building type (p[3] of the details container). The container is probed at
    # div[8], div[7] and div[6] in that order; the first one that resolves wins.
    for panel in (8, 7, 6):
        try:
            building_types = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[3]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(building_types)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    building_types_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(building_types))
    time.sleep(1)
    # Transaction ownership (p[5] of the details container). The container is
    # probed at div[8], div[7] and div[6] in that order; the first one that
    # resolves wins.
    for panel in (8, 7, 6):
        try:
            Transaction_ownership = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[5]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(Transaction_ownership)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    Transaction_ownership_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(Transaction_ownership))
    time.sleep(1)
    # Water supply type (p[8] of the details container). The container is
    # probed at div[8], div[7] and div[6] in that order; the first one that
    # resolves wins.
    for panel in (8, 7, 6):
        try:
            Water_use_type = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[8]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(Water_use_type)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    Water_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(Water_use_type))
    time.sleep(1)
    # Electricity supply type (p[9] of the details container). The container
    # is probed at div[8], div[7] and div[6] in that order; the first one that
    # resolves wins.
    for panel in (8, 7, 6):
        try:
            Dian_use_type = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[9]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(Dian_use_type)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    Dian_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(Dian_use_type))
    time.sleep(1)
    # Number of fixed parking spaces (p[10] of the details container). The
    # container is probed at div[8], div[7] and div[6] in that order; the first
    # one that resolves wins.
    for panel in (8, 7, 6):
        try:
            parking_space = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[10]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(parking_space)
    # Keep the text only when this row carries the expected label. The check
    # inspects parking_space itself, never the property-fee tokens; a row
    # without the label is recorded as a single space.
    if "固定车位数" in parking_space:
        # Single-space the tokens; runs of non-word, non-space chars become '-'.
        parking_space_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(parking_space))
    else:
        parking_space_string = ' '
    time.sleep(1)
    # Parking fee (p[11] of the details container). The container is probed at
    # div[8], div[7] and div[6] in that order; the first one that resolves wins.
    for panel in (8, 7, 6):
        try:
            parking_fee = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[11]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(parking_fee)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    parking_fee_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(parking_fee))
    time.sleep(1)
    # Gas fee (p[12] of the details container). The container is probed at
    # div[8], div[7] and div[6] in that order; the first one that resolves wins.
    for panel in (8, 7, 6):
        try:
            gas_fee = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[12]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(gas_fee)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    gas_fee_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(gas_fee))
    time.sleep(1)

5.使用pandas将爬取到的数据导出到Excel中

    temp_df = pd.DataFrame({'小区地址': [address_string],
                            '小区名称': [name_string],
                            '物业费': [property_feestring],
                            '物业公司': [property_company_string],
                            '物业开发商': [developers_string],
                            '房屋用途': [house_string],
                            '建筑类型': [building_types_string],
                            '交易权属': [Transaction_ownership_string],
                            '用水类型': [Water_string],
                            '用电类型': [Dian_string],
                            '固定车位数': [parking_space_string],
                            '停车费用': [parking_fee_string],
                            '燃气费用': [gas_fee_string],
                            '容积率': [plot_ratio_string],
                            '绿化率': [greening_rate_string],
                            '建房年代': [datatime_string]
                            })

    df = pd.concat([df, temp_df], ignore_index=True)
df.to_excel("C:/Users/Administrator/Desktop/小区信息.xlsx", index=False)
driver.quit()

6.完整代码

import re
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
# Start Chrome through the chromedriver binary at this local path.
chrome_service = Service(r"D:\application\Chrome\chromedriver-win64\chromedriver.exe")
driver = webdriver.Chrome(service=chrome_service)
# Load the start URL, maximise the window, then pause 5 s before going on.
driver.get('https://m.ke.com/jx/bangdan/hainingshi1/ibd3?source=ershou/liebiao/bangdan')
driver.maximize_window()
time.sleep(5)
# Empty frame that the scraped rows are concatenated onto.
df = pd.DataFrame()
for i in range(1,10,2):
    print(i)
    # Click the image link inside div[i] of the list container.
    driver.find_element(By.XPATH,f'//*[@id="root"]/div/div[2]/div[2]/div[2]/div/div/div[1]/div/div[{i}]/div/div[1]/a/img').click()
    time.sleep(3)
    # Scroll 30% of the way down the document, then switch to the last
    # window handle known to the driver.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.3);")
    time.sleep(3)
    driver.switch_to.window(driver.window_handles[-1])
    # Open the full details view by clicking the element whose text contains
    # the label below.
    text_to_find = "查看全部信息"  # label text to search for on the page
    time.sleep(2)
    xpath_expression = f"//*[contains(text(), '{text_to_find}')]"
    element = driver.find_element(By.XPATH, xpath_expression)
    time.sleep(3)
    element.click()
    time.sleep(5)
    # Community name. Tokens are re-joined with single spaces and every run of
    # characters that is neither a word character nor whitespace becomes '-'.
    name = driver.find_element(By.XPATH, '//*[@id="root"]/div/div[1]/div[2]/div[1]/span[1]').text.split()
    print(name)
    name_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(name))
    time.sleep(1)
    # Community address, cleaned the same way.
    address = driver.find_element(By.CLASS_NAME, 'map-address').text.split()
    print(address)
    address_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(address))
    time.sleep(1)
    # NOTE(review): despite its name, this selects the row labelled
    # "建筑类型"; the value is only printed here. It still raises if the row
    # is missing, so it is kept as-is.
    property_fee1 = driver.find_element(By.XPATH, "//*[@class='item']//span[text()='建筑类型']/..").text.split()
    print(property_fee1)
    # Property fee. Several absolute (container div, <p> index) positions are
    # probed in this fixed order and the first one that resolves is used --
    # presumably because the row moves between page layouts (TODO confirm).
    fee_positions = ((8, 15), (7, 15), (8, 14), (8, 13), (6, 13), (7, 14), (7, 13))
    for panel, para in fee_positions:
        try:
            property_fee = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[{para}]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if (panel, para) == fee_positions[-1]:
                raise  # no candidate matched: propagate the lookup error
    print(property_fee)
    # Only the first position is label-checked; text found at a fallback
    # position is accepted as-is. A failed check yields a single space.
    if (panel, para) == fee_positions[0] and "物业费" not in property_fee:
        property_feestring = ' '
    else:
        # Single-space the tokens; runs of non-word, non-space chars become '-'.
        property_feestring = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(property_fee))
    time.sleep(1)
    # Property management company (p[6] of the details container). The
    # container is probed at div[8], div[7] and div[6] in that order; the first
    # one that resolves wins.
    for panel in (8, 7, 6):
        try:
            property_company = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[6]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(property_company)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    property_company_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(property_company))
    time.sleep(1)
    # Developer (p[4] of the details container). The container is probed at
    # div[8], div[7] and div[6] in that order; the first one that resolves wins.
    for panel in (8, 7, 6):
        try:
            developers = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[4]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(developers)
    # Build the string from `developers` itself (joining the property-company
    # tokens here would duplicate that column). Tokens are single-spaced and
    # runs of non-word, non-space chars become '-'.
    developers_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(developers))
    time.sleep(1)
    # House usage (p[2] of the details container). The container is probed at
    # div[8], div[7] and div[6] in that order; the first one that resolves wins.
    for panel in (8, 7, 6):
        try:
            house = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[2]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(house)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    house_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(house))
    time.sleep(1)
    # Building type (p[3] of the details container). The container is probed at
    # div[8], div[7] and div[6] in that order; the first one that resolves wins.
    for panel in (8, 7, 6):
        try:
            building_types = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[3]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(building_types)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    building_types_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(building_types))
    time.sleep(1)
    # Transaction ownership (p[5] of the details container). The container is
    # probed at div[8], div[7] and div[6] in that order; the first one that
    # resolves wins.
    for panel in (8, 7, 6):
        try:
            Transaction_ownership = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[5]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(Transaction_ownership)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    Transaction_ownership_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(Transaction_ownership))
    time.sleep(1)
    # Water supply type (p[8] of the details container). The container is
    # probed at div[8], div[7] and div[6] in that order; the first one that
    # resolves wins.
    for panel in (8, 7, 6):
        try:
            Water_use_type = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[8]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(Water_use_type)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    Water_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(Water_use_type))
    time.sleep(1)
    # Electricity supply type (p[9] of the details container). The container
    # is probed at div[8], div[7] and div[6] in that order; the first one that
    # resolves wins.
    for panel in (8, 7, 6):
        try:
            Dian_use_type = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[9]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(Dian_use_type)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    Dian_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(Dian_use_type))
    time.sleep(1)
    # Number of fixed parking spaces (p[10] of the details container). The
    # container is probed at div[8], div[7] and div[6] in that order; the first
    # one that resolves wins.
    for panel in (8, 7, 6):
        try:
            parking_space = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[10]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(parking_space)
    # Keep the text only when this row carries the expected label. The check
    # inspects parking_space itself, never the property-fee tokens; a row
    # without the label is recorded as a single space.
    if "固定车位数" in parking_space:
        # Single-space the tokens; runs of non-word, non-space chars become '-'.
        parking_space_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(parking_space))
    else:
        parking_space_string = ' '
    time.sleep(1)
    # Parking fee (p[11] of the details container). The container is probed at
    # div[8], div[7] and div[6] in that order; the first one that resolves wins.
    for panel in (8, 7, 6):
        try:
            parking_fee = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[11]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(parking_fee)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    parking_fee_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(parking_fee))
    time.sleep(1)
    # Gas fee (p[12] of the details container). The container is probed at
    # div[8], div[7] and div[6] in that order; the first one that resolves wins.
    for panel in (8, 7, 6):
        try:
            gas_fee = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[12]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(gas_fee)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    gas_fee_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(gas_fee))
    time.sleep(1)
    # Plot ratio (p[13] of the details container). The container is probed at
    # div[8], div[7] and div[6] in that order; the first one that resolves wins.
    for panel in (8, 7, 6):
        try:
            plot_ratio = driver.find_element(
                By.XPATH, f'//*[@id="root"]/div/div[1]/div[{panel}]/div/p[13]').text.split()
            break
        except Exception:  # unlike a bare except, Ctrl-C still aborts the run
            if panel == 6:
                raise  # no candidate matched: propagate the lookup error
    print(plot_ratio)
    # Single-space the tokens; runs of non-word, non-space chars become '-'.
    plot_ratio_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(plot_ratio))
    time.sleep(1)
    # Greening rate: read the 14th <p> of the detail container, probing
    # div[8], then div[7], then div[6]. This field is optional: when none of
    # the candidates exists a message is printed and the value stays empty.
    #
    # The default is assigned up front so that a missing field can neither
    # raise NameError when the row is built nor silently reuse the value
    # scraped for the previous community.
    greening_rate_string = ''
    for container_index in (8, 7, 6):
        greening_rate_xpath = f'//*[@id="root"]/div/div[1]/div[{container_index}]/div/p[14]'
        try:
            greening_rate = driver.find_element(By.XPATH, greening_rate_xpath).text.split()
        except Exception:
            # `except Exception` instead of a bare `except:` so Ctrl+C and
            # SystemExit still stop the scraper; try the next candidate.
            continue
        print(greening_rate)
        # Join the tokens with single spaces, then replace each run of
        # characters that are neither word characters nor whitespace with '-'.
        # Raw string: '\w' and '\s' are invalid escapes in a plain string.
        greening_rate_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(greening_rate))
        time.sleep(1)
        break
    else:
        # No candidate container held a 14th <p>.
        print('没有下标为14的数据')

    # Construction year: read the 1st <p> of the detail container. The
    # container is probed at div[8], then div[7], then div[6] (presumably
    # because the page layout differs between communities -- TODO confirm);
    # the first XPath that resolves wins.
    for container_index in (8, 7, 6):
        datatime_xpath = f'//*[@id="root"]/div/div[1]/div[{container_index}]/div/p[1]'
        try:
            datatime = driver.find_element(By.XPATH, datatime_xpath).text.split()
        except Exception:
            # `except Exception` instead of a bare `except:` so Ctrl+C and
            # SystemExit still stop the scraper. A failure on the last
            # candidate propagates to the caller, as it did before.
            if container_index == 6:
                raise
            continue
        break
    print(datatime)
    # Join the whitespace-split tokens with single spaces, then replace each
    # run of characters that are neither word characters nor whitespace with
    # one '-'. Raw string: '\w' and '\s' are invalid escapes in a plain string.
    datatime_string = re.sub(r'[^\w\s\n\r\t]+', '-', ' '.join(datatime))
    # This is the last field read from the detail page, so navigate back in
    # the browser history before the row is assembled.
    driver.back()
    time.sleep(1)

    # Collect the cleaned field strings for this community, keyed by the
    # column header each one is stored under.
    row_values = {
        '小区地址': address_string,
        '小区名称': name_string,
        '物业费': property_feestring,
        '物业公司': property_company_string,
        '物业开发商': developers_string,
        '房屋用途': house_string,
        '建筑类型': building_types_string,
        '交易权属': Transaction_ownership_string,
        '用水类型': Water_string,
        '用电类型': Dian_string,
        '固定车位数': parking_space_string,
        '停车费用': parking_fee_string,
        '燃气费用': gas_fee_string,
        '容积率': plot_ratio_string,
        '绿化率': greening_rate_string,
        '建房年代': datatime_string,
    }
    # Wrap every value in a one-element list so the frame has exactly one row.
    temp_df = pd.DataFrame({column: [value] for column, value in row_values.items()})

    # Append the row to the accumulated table, renumbering the index.
    df = pd.concat([df, temp_df], ignore_index=True)
# Write the accumulated table to an Excel workbook, then shut the browser
# down. The try/finally guarantees driver.quit() runs even when the export
# fails (for example, the workbook is open in Excel or the path does not
# exist), so no browser/chromedriver process is left running.
try:
    df.to_excel("C:/Users/Administrator/Desktop/小区信息.xlsx", index=False)
finally:
    driver.quit()
使用基于文本内容的定位方式(例如 XPath 的 text()、contains() 匹配)获取页面元素,并用正则表达式清洗取到的文本,可以使我们获取的数据更加准确:页面结构发生变化时,数据不会受到影响;反之,如果依赖固定下标的绝对路径,页面只要发生变化,我们的代码就需要跟着修改。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值