【python】anjuke小区信息收集

阿尔卑斯山林

已于 2023-04-30 22:54:01 修改

阅读量370

点赞数 2

分类专栏：爬虫、python程序 | 文章标签：python、开发语言

于 2023-04-30 12:39:21 首次发布

本文链接：https://blog.csdn.net/weixin_47970003/article/details/130447665

版权

python程序同时被 2 个专栏收录

7 篇文章 0 订阅

订阅专栏

爬虫

5 篇文章 0 订阅

订阅专栏

1.首先在主页面中收集详情页面的url
主页面示例：
在这里插入图片描述

2.在详情页面中收集小区的名称、房价、建筑年代、建筑类型、物业费用等各类信息
详情页面示例：
在这里插入图片描述
3.将小区的信息储存到excel中
效果如下：

4.完整示例代码如下：

import random
import time

import pandas as pd
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
#1. Collect the detail-page URLs from the listing pages
url_temp = "https://shanghai.anjuke.com/community/pudong/p{}/"

user_agents = [
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.58'
    ]
# One User-Agent is picked at random per run and reused for every request.
UserAgent = random.choice(user_agents)
# cookie / Accept / Referer are left empty here -- presumably placeholders to
# be filled in before running.
headers = {'User-Agent': UserAgent,'cookie':'',
'Accept':'',
'Referer':''}
href_list = []     # detail-page URLs in listing order, without duplicates
seen_hrefs = set()  # O(1) duplicate check while preserving href_list order
for x in range(1,2):    # listing pages to crawl: range(first_page, last_page + 1)
    page_url = url_temp.format(x)
    print(page_url)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(page_url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print("request failed for {}: {}".format(page_url, exc))
        continue
    if response.status_code != 200:
        # A non-200 reply carries no listing; skip it instead of silently
        # parsing an error page.
        print("unexpected HTTP status {} for {}".format(response.status_code, page_url))
        continue
    html_str = response.content.decode()
    tree = etree.HTML(html_str)
    if tree is None:
        continue
    # Each listing row is an <a class="li-row"> whose href is the detail page.
    for anchor in tree.xpath("//a[@class='li-row']"):
        href1 = anchor.get('href')
        if not href1 or href1 in seen_hrefs:
            continue
        seen_hrefs.add(href1)
        print(href1)
        href_list.append(href1)
#2. Scrape every detail page with Selenium
# Fill in the chromedriver path below. Download a build that matches the
# installed Chrome version, e.g. from
# https://registry.npmmirror.com/binary.html?path=chromedriver
# NOTE(review): `executable_path` is only accepted by older Selenium releases;
# confirm against the installed version.
driver = webdriver.Chrome(executable_path="")
title_list = []
price_list = []
PropertyType_list = []
OwnershipCategory_list = []
CompletionTime_list = []
PropertyRightPeriod_list = []
DoorNumber_list = []
TotalConstructionArea_list = []
PlotRatio_list = []
GreeningRate_list = []
BuildingType_list = []
BusinessDistrict_list = []
ParkingSpace_list = []
PropertyPrice_list = []
ParkingFee_list = []
ParkingManagementFee_list = []
address_list = []
crawl_urls = []  # URLs whose detail page has been scraped

# Absolute XPath of the value cell in the n-th row of the detail table.
_LAYOUT_ROW_XPATH = "//*[@id='__layout']/div/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[{}]/div[2]"

# (target list, XPath) pairs: one entry per exported column.
detail_fields = [
    (title_list, "//h1[@class='title']"),
    (price_list, "//span[@class='average']"),
    (PropertyType_list, "//div[@class='value value_0']"),
    (OwnershipCategory_list, "//div[@class='value value_1']"),
    (CompletionTime_list, "//div[@class='value value_2']"),
    (PropertyRightPeriod_list, "//div[@class='value value_3']"),
    (DoorNumber_list, "//div[@class='value value_4']"),
    (TotalConstructionArea_list, "//div[@class='value value_5']"),
    (PlotRatio_list, "//div[@class='value value_6']"),
    (GreeningRate_list, "//div[@class='value value_7']"),
    (BuildingType_list, "//div[@class='value value_8']"),
    (BusinessDistrict_list, "//div[@class='value value_9']"),
    (ParkingSpace_list, "//div[@class='value value_12']"),
    (PropertyPrice_list, "//div[@class='value value_13']"),
    (ParkingFee_list, _LAYOUT_ROW_XPATH.format(15)),
    (ParkingManagementFee_list, _LAYOUT_ROW_XPATH.format(16)),
    (address_list, _LAYOUT_ROW_XPATH.format(18)),
]


def _element_text(browser, xpath):
    """Return the text of the first element matching *xpath*, or None.

    ``find_elements`` returns an empty list when nothing matches, so a
    missing field yields None instead of raising or reusing the value
    scraped from the previous page.
    """
    matches = browser.find_elements(By.XPATH, xpath)
    return matches[0].text if matches else None


# A single pass may not scrape every page (login redirect, slider captcha),
# so URLs that were not scraped stay pending and are retried on the next
# pass. Pending URLs are de-duplicated and empty hrefs dropped, so the loop
# ends as soon as every distinct URL has been scraped.
pending_urls = [u for u in dict.fromkeys(href_list) if u]
try:
    while pending_urls:
        for url in pending_urls:
            print(url)
            driver.get(url)
            current_url = driver.current_url
            # Redirected to the login page: sign in with account and password.
            if 'login.anjuke.com' in current_url and 'form' in current_url:
                wait = WebDriverWait(driver, 10)
                wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
                iframe = driver.find_element(By.ID, "iframeLoginIfm")
                driver.switch_to.frame(iframe)
                driver.find_element(By.ID, "pwdTab").click()
                driver.find_element(By.ID, "pwdUserNameIpt").send_keys("")   # enter your account
                driver.find_element(By.ID, "pwdIpt").send_keys("")     # enter your password
                driver.find_element(By.ID, "checkagree").click()
                driver.find_element(By.ID, "pwdSubmitBtn").click()
            # Slider captcha: pause ten seconds so it can be solved by hand.
            elif 'www.anjuke.com' in current_url and 'captcha-verify' in current_url:
                time.sleep(10)
            else:
                # Read every field first so all columns grow by exactly one
                # row per page and stay aligned.
                values = [_element_text(driver, xpath) for _, xpath in detail_fields]
                print(values[0])
                for (target, _), value in zip(detail_fields, values):
                    target.append(value)
                crawl_urls.append(url)   # mark this URL as scraped
        scraped = set(crawl_urls)
        pending_urls = [u for u in pending_urls if u not in scraped]
finally:
    # Close the browser once, after all pages are done (or on error), rather
    # than after the first page.
    driver.quit()
#3. Save the collected community data to an Excel workbook
# Column name -> list of values; DataFrame requires all lists to have the same length.
item = {'title': title_list, 'price': price_list, 'PropertyType': PropertyType_list,
        'OwnershipCategory': OwnershipCategory_list, 'CompletionTime': CompletionTime_list,
        'PropertyRightPeriod': PropertyRightPeriod_list, 'DoorNumber': DoorNumber_list,
        'TotalConstructionArea': TotalConstructionArea_list,
        'PlotRatio': PlotRatio_list,
        'GreeningRate': GreeningRate_list, 'BuildingType': BuildingType_list,
        'BusinessDistrict': BusinessDistrict_list,
        'ParkingSpace': ParkingSpace_list, 'PropertyPrice': PropertyPrice_list,
        'ParkingFee': ParkingFee_list, 'ParkingManagementFee': ParkingManagementFee_list,
        'address': address_list}
file_path = "./test.xlsx"    # output workbook path
df = pd.DataFrame(item)
# The context manager writes and closes the workbook on exit, even if
# to_excel raises; this replaces the explicit writer.save() call, which
# recent pandas releases no longer provide.
with pd.ExcelWriter(file_path) as writer:
    df.to_excel(writer, sheet_name='sheet1', startcol=0, index=False)