1. 代码可以直接运行:请先下载并安装 Anaconda,建议在 Spyder 中运行,方便查看变量;
也可以直接查看运行后生成的 Excel 文件(anjukeHousesDetails.xlsx)。
2. 依赖库:在命令行中运行以下命令安装(Win10 打开命令行的快捷键:先按 Windows+X 组合键,再按 A 键):
pip install BeautifulSoup4
pip install requests
pip install lxml pandas openpyxl
3. 爬取的网站是安居客(厦门)网站,可以打开 https://xm.fang.anjuke.com/loupan/all/ 观察页面结构。
4. 如何判断代码是 Python 2 还是 Python 3:不带括号的 print "" 写法只能在 Python 2 中运行;本代码使用 print("") 的函数写法,按 Python 3 编写。
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 14 19:07:39 2018
@author: Steven Lei
"""
def _fetchSoup(url, timeout=10):
    """Download *url* and return the page parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get`` when the download fails or times out.
    """
    import requests
    from bs4 import BeautifulSoup

    # Without a timeout a single stalled connection blocks the crawl forever.
    response = requests.get(url, timeout=timeout)
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "lxml")


def _firstText(node, selector, default=""):
    """Return the text of the first element under *node* matching *selector*.

    Returns *default* when nothing matches, so a card with a missing field
    does not raise.
    """
    match = node.select_one(selector)
    return match.text if match is not None else default


def getHousesDetails(url):
    """Scrape one listing page and return the details of each house on it.

    Args:
        url: Address of a listing page, e.g.
            "https://xm.fang.anjuke.com/loupan/all/p1/".

    Returns:
        A list of dictionaries, one per listing, with the keys
        "houseName", "price", "address", "houseSize", "saleStatus" and
        "houseType". A field that cannot be found on the page is stored
        as an empty string instead of aborting the whole scrape.

    Raises:
        requests.exceptions.RequestException: if a page cannot be
            downloaded within the timeout.
    """
    listingSoup = _fetchSoup(url)
    housesDetails = []

    # The first three ".item-mod" elements are skipped, as the script has
    # always done (presumably they are not real listings -- confirm).
    for house in listingSoup.select(".item-mod")[3:]:
        # Name of the development.
        houseName = _firstText(house, ".items-name")

        # Price: prefer ".price"; fall back to ".price-txt" when absent.
        priceNode = house.select_one(".price")
        if priceNode is None:
            priceNode = house.select_one(".price-txt")
        price = priceNode.text if priceNode is not None else ""

        # Address. A trailing "." is treated as a sign that the list page
        # shortened the address, so the full text is read from the detail
        # page. endswith() is used because indexing [-1] fails on "".
        address = _firstText(house, ".list-map")
        if address.endswith("."):
            detailLink = house.select_one(".pic")
            href = detailLink.get("href") if detailLink is not None else None
            if href:
                # Separate names keep the listing-page soup untouched.
                detailSoup = _fetchSoup(href)
                address = _firstText(detailSoup, ".lpAddr-text", default=address)

        # Floor area: the last <span> inside ".huxing", if there is one.
        sizeSpans = house.select(".huxing span")
        houseSize = sizeSpans[-1].text if sizeSpans else ""

        # Sale status is the first <i> tag in ".tag-panel". The house type
        # is the second <i> when exactly two are present, otherwise the
        # first <span> in the same panel.
        statusTags = house.select(".tag-panel i")
        saleStatus = statusTags[0].text if statusTags else ""
        if len(statusTags) == 2:
            houseType = statusTags[1].text
        else:
            houseType = _firstText(house, ".tag-panel span")

        # Collect the fields into one record per house.
        houseDetail = {
            "houseName": houseName,
            "price": price,
            "address": address,
            "houseSize": houseSize,
            "saleStatus": saleStatus,
            "houseType": houseType,
        }
        print(houseDetail)
        housesDetails.append(houseDetail)

    return housesDetails
def getAllHouseDetails(pages=range(1, 8)):
    """Scrape several listing pages and combine the results into one table.

    Args:
        pages: Iterable of page numbers to fetch. Each number is inserted
            into the "https://xm.fang.anjuke.com/loupan/all/p{}/" URL.
            Defaults to pages 1 through 7, the range this script has
            always crawled.

    Returns:
        A pandas DataFrame with one row per dictionary returned by
        getHousesDetails, in page order. The frame is empty when *pages*
        is empty.
    """
    import pandas

    urlBefore = "https://xm.fang.anjuke.com/loupan/all/p{}/"
    allHouseDetails = []
    for pageNumber in pages:
        pageUrl = urlBefore.format(pageNumber)
        allHouseDetails.extend(getHousesDetails(pageUrl))
    return pandas.DataFrame(allHouseDetails)
if __name__ == "__main__":
    # Crawl the listing pages, save the combined table as an Excel
    # workbook, then print the first ten rows as a quick check.
    # To try a single page instead, call
    # getHousesDetails("https://xm.fang.anjuke.com/loupan/all/p1/").
    outputPath = "anjukeHousesDetails.xlsx"
    allHouseDetails = getAllHouseDetails()
    allHouseDetails.to_excel(outputPath)
    print(allHouseDetails.head(10))