【python】anjuke小区信息收集

1.首先在主页面中收集详情页面的url
主页面示例:
(主页面示例截图略)

2.收集详情页面的小区的名称、房价、建筑年代、建筑类型、物业费用等各类信息
详情页面示例:
(详情页面示例截图略)
3.将小区的信息储存到excel中
效果如下:
(Excel 效果截图略)
4.完整示例代码如下:

import random
import time

import pandas as pd
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# 1. Collect the detail-page URLs from the community listing pages.
url_temp = "https://shanghai.anjuke.com/community/pudong/p{}/"

user_agents = [
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.58'
    ]
UserAgent = random.choice(user_agents)  # rotate UA to look less like a bot
headers = {'User-Agent': UserAgent,'cookie':'',
'Accept':'',
'Referer':''}
href_list = []     # detail-page URLs harvested from the listing pages
for x in range(1, 2):    # page range to crawl: start inclusive, stop exclusive
    page_url = url_temp.format(x)
    print(page_url)
    response = requests.get(page_url, headers=headers)
    html_str = response.content.decode()
    tree = etree.HTML(html_str)
    hrefs = tree.xpath("//a[@class='li-row']")
    for href in hrefs:
        href1 = href.get('href')
        # Guard against anchors without an href attribute — appending None
        # here would crash driver.get(None) later on.
        if href1:
            print(href1)
            href_list.append(href1)
# 2. Scrape each detail page with Selenium.
# chromedriver download: https://registry.npmmirror.com/binary.html?path=chromedriver
# — pick the build matching your installed Chrome version.
# NOTE: Selenium 4 removed the `executable_path` keyword argument; the path
# must now be wrapped in a Service object.
driver = webdriver.Chrome(service=Service(executable_path=""))   # fill in the chromedriver path
# One result list per column of the final spreadsheet; index i of every list
# belongs to the same community.
title_list = []
price_list = []
PropertyType_list = []
OwnershipCategory_list = []
CompletionTime_list = []
PropertyRightPeriod_list = []
DoorNumber_list = []
TotalConstructionArea_list = []
PlotRatio_list = []
GreeningRate_list = []
BuildingType_list = []
BusinessDistrict_list = []
ParkingSpace_list = []
PropertyPrice_list = []
ParkingFee_list = []
ParkingManagementFee_list = []
address_list = []
crawl_urls = []  # URLs already scraped successfully
# Some detail pages are blocked on the first pass (login wall, captcha), so we
# keep retrying the not-yet-crawled URLs until every entry of href_list has
# been scraped: the loop ends when len(crawl_urls) == len(href_list).

def _safe_text(by, locator, default=""):
    """Return the text of the first element matching *locator*, or *default*.

    Missing fields are expected (not every community publishes every value),
    so lookup failures are absorbed instead of aborting the page. Returning a
    default also prevents the original bugs of a NameError on the first page
    and stale values from the previous page being appended.
    """
    try:
        return driver.find_element(by, locator).text
    except Exception:
        return default

# (result list, xpath) pairs for the uniformly structured value_<n> fields
# plus the three fields only reachable by absolute path.
_FIELDS = [
    (PropertyType_list, "//div[@class='value value_0']"),
    (OwnershipCategory_list, "//div[@class='value value_1']"),
    (CompletionTime_list, "//div[@class='value value_2']"),
    (PropertyRightPeriod_list, "//div[@class='value value_3']"),
    (DoorNumber_list, "//div[@class='value value_4']"),
    (TotalConstructionArea_list, "//div[@class='value value_5']"),
    (PlotRatio_list, "//div[@class='value value_6']"),
    (GreeningRate_list, "//div[@class='value value_7']"),
    (BuildingType_list, "//div[@class='value value_8']"),
    (BusinessDistrict_list, "//div[@class='value value_9']"),
    (ParkingSpace_list, "//div[@class='value value_12']"),
    (PropertyPrice_list, "//div[@class='value value_13']"),
    (ParkingFee_list, "//*[@id='__layout']/div/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[15]/div[2]"),
    (ParkingManagementFee_list, "//*[@id='__layout']/div/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[16]/div[2]"),
    (address_list, "//*[@id='__layout']/div/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[18]/div[2]"),
]

while len(crawl_urls) < len(href_list):
    uncrawled_urls = set(href_list) - set(crawl_urls)
    for url in uncrawled_urls:
        print(url)
        driver.get(url)
        # Redirected to the login page: fill in credentials inside the login iframe.
        if 'login.anjuke.com' in driver.current_url and 'form' in driver.current_url:
            wait = WebDriverWait(driver, 10)
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            iframe = driver.find_element(By.ID, "iframeLoginIfm")
            driver.switch_to.frame(iframe)
            driver.find_element(By.ID, "pwdTab").click()
            driver.find_element(By.ID, "pwdUserNameIpt").send_keys("")   # your account
            driver.find_element(By.ID, "pwdIpt").send_keys("")     # your password
            driver.find_element(By.ID, "checkagree").click()
            driver.find_element(By.ID, "pwdSubmitBtn").click()
        # Slider captcha page: pause ten seconds so it can be solved by hand.
        elif 'www.anjuke.com' in driver.current_url and 'captcha-verify' in driver.current_url:
            time.sleep(10)
        else:
            title = _safe_text(By.XPATH, "//h1[@class='title']")
            print(title)
            price = _safe_text(By.XPATH, "//span[@class='average']")
            title_list.append(title)
            price_list.append(price)
            for target_list, xpath in _FIELDS:
                target_list.append(_safe_text(By.XPATH, xpath))
            crawl_urls.append(url)   # mark this URL as done
# BUGFIX: quit() used to sit inside the for loop, killing the browser session
# after the very first detail page; it belongs after the whole crawl.
driver.quit()
# 3. Save the community info to an Excel workbook.
item = {'title': title_list, 'price': price_list, 'PropertyType': PropertyType_list,
        'OwnershipCategory': OwnershipCategory_list, 'CompletionTime': CompletionTime_list,
        'PropertyRightPeriod': PropertyRightPeriod_list, 'DoorNumber': DoorNumber_list,
        'TotalConstructionArea': TotalConstructionArea_list,
        'PlotRatio': PlotRatio_list,
        'GreeningRate': GreeningRate_list, 'BuildingType': BuildingType_list,
        'BusinessDistrict': BusinessDistrict_list,
        'ParkingSpace': ParkingSpace_list, 'PropertyPrice': PropertyPrice_list,
        'ParkingFee': ParkingFee_list, 'ParkingManagementFee': ParkingManagementFee_list,
        'address': address_list}     # column name -> column values
file_path = "./test.xlsx"    # output path
df = pd.DataFrame(item)
# ExcelWriter.save() was removed in pandas 2.0; the context manager writes and
# closes the file on exit. The sheet name must be passed as sheet_name=.
with pd.ExcelWriter(file_path) as writer:
    df.to_excel(writer, sheet_name='sheet1', startcol=0, index=False)
  • 2
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值