1.首先在主页面中收集详情页面的url
主页面示例:
2.在详情页面中收集小区的名称、房价、建筑年代、建筑类型、物业费用等各类信息
详情页面示例:
3.将小区的信息储存到excel中
效果如下:
4.完整示例代码如下:
import random
import time

import pandas as pd
import requests
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# 1. Collect the detail-page URLs from the listing (main) pages.
url_temp = "https://shanghai.anjuke.com/community/pudong/p{}/"
user_agents = [
    'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.58',
]
UserAgent = random.choice(user_agents)
# NOTE(review): cookie / Accept / Referer are blank here; presumably they are
# placeholders to be filled in from a real browser session -- confirm.
headers = {
    'User-Agent': UserAgent,
    'cookie': '',
    'Accept': '',
    'Referer': '',
}
REQUEST_TIMEOUT = 15  # seconds; without a timeout requests.get can block forever

href_list = []  # detail-page URLs, in listing order, without duplicates
for x in range(1, 2):  # listing pages to crawl: first page (inclusive) .. last page (exclusive)
    page_url = url_temp.format(x)
    print(page_url)
    try:
        response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable listing page should not abort the whole crawl.
        print(f"skipping {page_url}: {exc}")
        continue
    html_str = response.content.decode()
    tree = etree.HTML(html_str)
    if tree is None:
        # etree.HTML can yield None for an empty document; nothing to extract.
        continue
    for href in tree.xpath("//a[@class='li-row']"):
        href1 = href.get('href')
        print(href1)
        # Skip anchors without an href and links already collected; the
        # detail-page crawl below compares list lengths against distinct URLs.
        if href1 and href1 not in href_list:
            href_list.append(href1)
# 2. Scrape the detail pages.
driver = webdriver.Chrome(executable_path="")  # path to your chromedriver binary
# chromedriver downloads: https://registry.npmmirror.com/binary.html?path=chromedriver
# (pick the build that matches the installed Chrome version).
# NOTE(review): recent Selenium releases (4.10+) no longer accept
# `executable_path`; there, pass `service=Service("<path>")` from
# selenium.webdriver.chrome.service instead -- confirm against your version.

# One list per output column; section 3 turns them into a DataFrame, so every
# scraped page must append exactly one value to each of them.
title_list = []
price_list = []
PropertyType_list = []
OwnershipCategory_list = []
CompletionTime_list = []
PropertyRightPeriod_list = []
DoorNumber_list = []
TotalConstructionArea_list = []
PlotRatio_list = []
GreeningRate_list = []
BuildingType_list = []
BusinessDistrict_list = []
ParkingSpace_list = []
PropertyPrice_list = []
ParkingFee_list = []
ParkingManagementFee_list = []
address_list = []
crawl_urls = []  # URLs whose detail page has been scraped successfully

# (xpath, destination list) for every column. The title entry must stay first:
# _record_page() prints it.
_INFO_ROW = "//*[@id='__layout']/div/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[{}]/div[2]"
FIELD_TARGETS = [
    ("//h1[@class='title']", title_list),
    ("//span[@class='average']", price_list),
    ("//div[@class='value value_0']", PropertyType_list),
    ("//div[@class='value value_1']", OwnershipCategory_list),
    ("//div[@class='value value_2']", CompletionTime_list),
    ("//div[@class='value value_3']", PropertyRightPeriod_list),
    ("//div[@class='value value_4']", DoorNumber_list),
    ("//div[@class='value value_5']", TotalConstructionArea_list),
    ("//div[@class='value value_6']", PlotRatio_list),
    ("//div[@class='value value_7']", GreeningRate_list),
    ("//div[@class='value value_8']", BuildingType_list),
    ("//div[@class='value value_9']", BusinessDistrict_list),
    ("//div[@class='value value_12']", ParkingSpace_list),
    ("//div[@class='value value_13']", PropertyPrice_list),
    # The last three fields are addressed by position in the info table.
    (_INFO_ROW.format(15), ParkingFee_list),
    (_INFO_ROW.format(16), ParkingManagementFee_list),
    (_INFO_ROW.format(18), address_list),
]

# Upper bound on retry passes, so a page that never loads (bad credentials,
# unsolved captcha) cannot keep the script running forever.
MAX_ROUNDS = 10


def _element_text(xpath):
    """Return the text of the first element matching *xpath*, or None if absent."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except WebDriverException:
        # A missing field is recorded as None rather than silently reusing the
        # value scraped from the previous page.
        return None


def _submit_login_form():
    """Fill in and submit the password-login form inside the login iframe."""
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    iframe = driver.find_element(By.ID, "iframeLoginIfm")
    driver.switch_to.frame(iframe)
    driver.find_element(By.ID, "pwdTab").click()
    driver.find_element(By.ID, "pwdUserNameIpt").send_keys("")  # your account name
    driver.find_element(By.ID, "pwdIpt").send_keys("")  # your password
    driver.find_element(By.ID, "checkagree").click()
    driver.find_element(By.ID, "pwdSubmitBtn").click()


def _record_page():
    """Append one value per column for the page currently loaded in the driver."""
    values = [_element_text(xpath) for xpath, _ in FIELD_TARGETS]
    if values[0] is not None:
        print(values[0])  # community title
    for (_, target), value in zip(FIELD_TARGETS, values):
        target.append(value)


def _pending_urls():
    """Return the distinct, non-empty URLs not scraped yet, in listing order."""
    done = set(crawl_urls)
    return [u for u in dict.fromkeys(href_list) if u and u not in done]


# A single pass does not always reach every detail page (login redirects and
# slider captchas get in the way), so keep re-visiting whatever is still
# pending until nothing is left or MAX_ROUNDS passes have been made.
try:
    rounds = 0
    pending = _pending_urls()
    while pending and rounds < MAX_ROUNDS:
        rounds += 1
        for url in pending:
            print(url)
            driver.get(url)
            current = driver.current_url
            if 'login.anjuke.com' in current and 'form' in current:
                # Redirected to the login page: log in; the URL stays pending.
                _submit_login_form()
            elif 'www.anjuke.com' in current and 'captcha-verify' in current:
                # Slider captcha: pause 10 s so it can be solved by hand.
                time.sleep(10)
            else:
                _record_page()
                crawl_urls.append(url)  # mark as scraped
        pending = _pending_urls()
    if pending:
        print(f"gave up on {len(pending)} url(s) after {MAX_ROUNDS} rounds: {pending}")
finally:
    # Always close the browser, even if a page raised midway.
    driver.quit()
# 3. Store the community information in an Excel workbook.
# Column name -> list of scraped values (one entry per scraped page).
item = {
    'title': title_list,
    'price': price_list,
    'PropertyType': PropertyType_list,
    'OwnershipCategory': OwnershipCategory_list,
    'CompletionTime': CompletionTime_list,
    'PropertyRightPeriod': PropertyRightPeriod_list,
    'DoorNumber': DoorNumber_list,
    'TotalConstructionArea': TotalConstructionArea_list,
    'PlotRatio': PlotRatio_list,
    'GreeningRate': GreeningRate_list,
    'BuildingType': BuildingType_list,
    'BusinessDistrict': BusinessDistrict_list,
    'ParkingSpace': ParkingSpace_list,
    'PropertyPrice': PropertyPrice_list,
    'ParkingFee': ParkingFee_list,
    'ParkingManagementFee': ParkingManagementFee_list,
    'address': address_list,
}
file_path = "./test.xlsx"  # output workbook path
df = pd.DataFrame(item)
# The context manager saves and closes the workbook on exit; the explicit
# ExcelWriter.save() call is not available in pandas 2.x. sheet_name is passed
# by keyword because positional use is deprecated.
with pd.ExcelWriter(file_path) as writer:
    df.to_excel(writer, sheet_name='sheet1', startcol=0, index=False)