使用 Python 脚本获取贝壳的房源信息,可以方便我们对比房价、快速筛选出符合需求的房子,也便于我们了解最新的房价动态,掌握一手房源信息。话不多说,直接上干货!
1.导入所需库
import re
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
2.打开谷歌浏览器到指定页面
# Launch Chrome through the locally installed chromedriver binary.
chromedriver_service = Service(r"D:\application\Chrome\chromedriver-win64\chromedriver.exe")
driver = webdriver.Chrome(service=chromedriver_service)

# Open the ke.com list page, maximize the window and give the page
# five seconds to finish rendering before any element is looked up.
start_url = 'https://m.ke.com/jx/bangdan/hainingshi1/ibd3?source=ershou/liebiao/bangdan'
driver.get(start_url)
driver.maximize_window()
time.sleep(5)

# Empty table; one row per scraped community is appended to it later.
df = pd.DataFrame()
3.使用driver方法模拟人为点击到我们需要的页面
def clean_text(raw):
    """Collapse whitespace in *raw* and replace punctuation runs with '-'.

    Every run of characters that is neither a word character nor
    whitespace (this includes decimal points and slashes) becomes a
    single '-'.
    """
    return re.sub(r'[^\w\s]+', '-', ' '.join(raw.split()))


# 1-based position of the listing card to open. The complete script
# loops over positions 1, 3, 5, 7 and 9; this walkthrough opens the first.
i = 1
driver.find_element(By.XPATH,f'//*[@id="root"]/div/div[2]/div[2]/div[2]/div/div/div[1]/div/div[{i}]/div/div[1]/a/img').click()
time.sleep(3)

# Switch to the most recently opened window *before* scrolling, so the
# scroll is applied to the page that was just opened.
driver.switch_to.window(driver.window_handles[-1])
driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.3);")
time.sleep(3)

# Open the community's full information view by clicking the element
# whose text contains the link caption.
text_to_find = "查看全部信息"
time.sleep(2)
element = driver.find_element(By.XPATH, f"//*[contains(text(), '{text_to_find}')]")
time.sleep(3)
element.click()
time.sleep(5)

# Community name.
name_string = clean_text(driver.find_element(By.XPATH, '//*[@id="root"]/div/div[1]/div[2]/div[1]/span[1]').text)
print(name_string)

# Community address.
address_string = clean_text(driver.find_element(By.CLASS_NAME, 'map-address').text)
print(address_string)
4.使用XPath定位页面元素,并用正则表达式清洗获取到的文本
from selenium.common.exceptions import NoSuchElementException

# Value stored when a field cannot be found on the page.
MISSING = " "
# XPath template for one <p> entry of the detail list. The list is looked
# up under div[8], div[7] and div[6] in turn (presumably because its
# position differs between community pages).
DETAIL_XPATH = '//*[@id="root"]/div/div[1]/div[{div}]/div/p[{p}]'


def clean_text(raw):
    """Collapse whitespace in *raw* and replace punctuation runs with '-'.

    Every run of characters that is neither a word character nor
    whitespace (this includes decimal points and slashes) becomes a
    single '-'.
    """
    return re.sub(r'[^\w\s]+', '-', ' '.join(raw.split()))


def detail_xpaths(p, divs=(8, 7, 6)):
    """Return the candidate XPaths for the *p*-th detail entry, in lookup order."""
    return [DETAIL_XPATH.format(div=div, p=p) for div in divs]


def read_field(xpaths, label=None):
    """Return the cleaned text of the first usable element among *xpaths*.

    A candidate is usable when it exists and, if *label* is given, when
    *label* is one of the whitespace-separated tokens of its text. When no
    candidate is usable, ``MISSING`` is returned instead of raising.
    """
    for xpath in xpaths:
        try:
            tokens = driver.find_element(By.XPATH, xpath).text.split()
        except NoSuchElementException:
            continue
        print(tokens)
        if label is None or label in tokens:
            return clean_text(' '.join(tokens))
    return MISSING


# Property fee: its slot varies, so several (div, p) positions are tried
# and only an entry that actually carries the label is accepted.
property_feestring = read_field(
    [DETAIL_XPATH.format(div=div, p=p)
     for div, p in ((8, 15), (7, 15), (8, 14), (8, 13), (6, 13), (7, 14), (7, 13))],
    label="物业费",
)
# Property management company.
property_company_string = read_field(detail_xpaths(6))
# Developer.
developers_string = read_field(detail_xpaths(4))
# Building usage.
house_string = read_field(detail_xpaths(2))
# Building type.
building_types_string = read_field(detail_xpaths(3))
# Ownership type.
Transaction_ownership_string = read_field(detail_xpaths(5))
# Water supply type.
Water_string = read_field(detail_xpaths(8))
# Electricity supply type.
Dian_string = read_field(detail_xpaths(9))
# Number of fixed parking spaces (accepted only when the label is present).
parking_space_string = read_field(detail_xpaths(10), label="固定车位数")
# Parking fee.
parking_fee_string = read_field(detail_xpaths(11))
# Gas fee.
gas_fee_string = read_field(detail_xpaths(12))
# Plot ratio, greening rate and construction year: required by the
# export step that builds the DataFrame row.
plot_ratio_string = read_field(detail_xpaths(13))
greening_rate_string = read_field(detail_xpaths(14))
datatime_string = read_field(detail_xpaths(1))
5.使用pandas将爬取到的数据导出到Excel文件中
# One spreadsheet row for the community that was just scraped; the keys
# are the column headers of the exported sheet.
row = {
    '小区地址': address_string,
    '小区名称': name_string,
    '物业费': property_feestring,
    '物业公司': property_company_string,
    '物业开发商': developers_string,
    '房屋用途': house_string,
    '建筑类型': building_types_string,
    '交易权属': Transaction_ownership_string,
    '用水类型': Water_string,
    '用电类型': Dian_string,
    '固定车位数': parking_space_string,
    '停车费用': parking_fee_string,
    '燃气费用': gas_fee_string,
    '容积率': plot_ratio_string,
    '绿化率': greening_rate_string,
    '建房年代': datatime_string,
}
temp_df = pd.DataFrame([row])

# Append the row to the accumulated table, write it out without the
# index column, then close the browser.
df = pd.concat([df, temp_df], ignore_index=True)
df.to_excel("C:/Users/Administrator/Desktop/小区信息.xlsx", index=False)
driver.quit()
6.完整代码
import re
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
# Local chromedriver binary used to launch Chrome.
CHROMEDRIVER_PATH = r"D:\application\Chrome\chromedriver-win64\chromedriver.exe"
# List page whose cards link to the individual community pages.
START_URL = 'https://m.ke.com/jx/bangdan/hainingshi1/ibd3?source=ershou/liebiao/bangdan'
# Destination of the exported spreadsheet.
OUTPUT_PATH = "C:/Users/Administrator/Desktop/小区信息.xlsx"

# Image link of the listing card at a given 1-based position.
LISTING_XPATH = '//*[@id="root"]/div/div[2]/div[2]/div[2]/div/div/div[1]/div/div[{position}]/div/div[1]/a/img'
# Element holding the community name.
NAME_XPATH = '//*[@id="root"]/div/div[1]/div[2]/div[1]/span[1]'
# One <p> entry of the detail list. The list is looked up under div[8],
# div[7] and div[6] in turn (presumably because its position differs
# between community pages).
DETAIL_XPATH = '//*[@id="root"]/div/div[1]/div[{div}]/div/p[{p}]'
DETAIL_DIVS = (8, 7, 6)
# Caption of the link that opens the full information view.
FULL_INFO_TEXT = "查看全部信息"

# Value stored when a field cannot be found on the page.
MISSING = " "
# Runs of characters that are neither word characters nor whitespace.
PUNCTUATION = re.compile(r"[^\w\s]+")

# The property fee does not have a fixed slot: these (div, p) positions
# are tried in order and only an entry carrying the label is accepted.
PROPERTY_FEE_LABEL = '物业费'
PROPERTY_FEE_SLOTS = ((8, 15), (7, 15), (8, 14), (8, 13), (6, 13), (7, 14), (7, 13))

# (output column, p index in the detail list, required label or None),
# in the order the columns appear in the spreadsheet.
DETAIL_FIELDS = (
    ('物业公司', 6, None),
    ('物业开发商', 4, None),
    ('房屋用途', 2, None),
    ('建筑类型', 3, None),
    ('交易权属', 5, None),
    ('用水类型', 8, None),
    ('用电类型', 9, None),
    ('固定车位数', 10, '固定车位数'),
    ('停车费用', 11, None),
    ('燃气费用', 12, None),
    ('容积率', 13, None),
    ('绿化率', 14, None),
    ('建房年代', 1, None),
)


def clean_text(raw):
    """Collapse whitespace in *raw* and replace punctuation runs with '-'.

    Every run of characters that is neither a word character nor
    whitespace (this includes decimal points and slashes) becomes a
    single '-'.
    """
    return PUNCTUATION.sub('-', ' '.join(raw.split()))


def detail_xpaths(p, divs=DETAIL_DIVS):
    """Return the candidate XPaths for the *p*-th detail entry, in lookup order."""
    return [DETAIL_XPATH.format(div=div, p=p) for div in divs]


def read_field(driver, xpaths, label=None):
    """Return the cleaned text of the first usable element among *xpaths*.

    A candidate is usable when it exists and, if *label* is given, when
    *label* is one of the whitespace-separated tokens of its text. When no
    candidate is usable, ``MISSING`` is returned instead of raising, so a
    single absent field does not abort the whole run.
    """
    # Imported here so the pure helpers above stay importable on their own.
    from selenium.common.exceptions import NoSuchElementException

    for xpath in xpaths:
        try:
            tokens = driver.find_element(By.XPATH, xpath).text.split()
        except NoSuchElementException:
            continue
        print(tokens)
        if label is None or label in tokens:
            return clean_text(' '.join(tokens))
    return MISSING


def scrape_community(driver, position):
    """Open the listing card at *position* and return its data as a row dict.

    The keys of the returned dict are the spreadsheet column headers. The
    browser is sent one step back in its history before returning.
    Raises ``NoSuchElementException`` when the listing card, the full-info
    link, the name or the address cannot be located.
    """
    driver.find_element(By.XPATH, LISTING_XPATH.format(position=position)).click()
    time.sleep(3)

    # Switch to the most recently opened window *before* scrolling, so the
    # scroll is applied to the page that was just opened.
    driver.switch_to.window(driver.window_handles[-1])
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.3);")
    time.sleep(5)

    # Open the full information view.
    link = driver.find_element(By.XPATH, f"//*[contains(text(), '{FULL_INFO_TEXT}')]")
    time.sleep(3)
    link.click()
    time.sleep(5)

    name = clean_text(driver.find_element(By.XPATH, NAME_XPATH).text)
    print(name)
    address = clean_text(driver.find_element(By.CLASS_NAME, 'map-address').text)
    print(address)

    row = {'小区地址': address, '小区名称': name}
    row['物业费'] = read_field(
        driver,
        [DETAIL_XPATH.format(div=div, p=p) for div, p in PROPERTY_FEE_SLOTS],
        label=PROPERTY_FEE_LABEL,
    )
    for column, p_index, label in DETAIL_FIELDS:
        row[column] = read_field(driver, detail_xpaths(p_index), label=label)

    driver.back()
    time.sleep(1)
    return row


def main():
    """Scrape the listings at positions 1, 3, 5, 7, 9 and export them to Excel."""
    driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))
    rows = []
    try:
        driver.get(START_URL)
        driver.maximize_window()
        time.sleep(5)
        for position in range(1, 10, 2):
            print(position)
            rows.append(scrape_community(driver, position))
    finally:
        # Always close the browser, even when a page could not be scraped.
        driver.quit()

    # Build the table once instead of concatenating inside the loop.
    pd.DataFrame(rows).to_excel(OUTPUT_PATH, index=False)


if __name__ == "__main__":
    main()
本文通过XPath定位页面元素,再使用正则表达式清洗获取到的文本,使导出的数据内容更加规整。需要注意的是,代码中大量使用了绝对XPath路径,页面结构一旦发生变化,元素定位就可能失效,代码也需要跟着调整;如果改为按字段名称(例如"物业费")等页面文本来定位元素,就可以减少页面变化对数据造成的影响。