Python: Scraping Anjuke Rental Listings (Nanchang as an Example)

Preface:

  1. Install the required libraries in advance.
  2. The only input the script needs is the URL of a city's rental-listing front page; everything else is derived from it.
  3. Create the required directories before running, or add os.makedirs() calls to the script (a sketch follows this list).
  4. Interrupted crawls can be resumed: just run the script again.
  5. The request headers (Cookie, User-Agent, etc.) depend on your environment and may need to be updated.
  6. The script uses an Amap (高德) API key to obtain geographic coordinates; the key has been omitted from this post. To use this feature, register as an Amap developer.
  7. Original content; please cite the source when quoting. Note: http://www.cnblogs.com/shadrach; author: shadrach@yeah.net.
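
A minimal sketch of point 3, assuming the base directory used later in the script (adjust the drive and path to your own layout):

import os

base_dir = "E:/20180123安居客_南昌租房"   # base path used throughout the script below
for sub in ("links", "split_data"):
    # exist_ok=True makes re-runs harmless if the folders already exist
    os.makedirs(base_dir + "/" + sub, exist_ok=True)
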
# author: shadrach@yeah.net
# blog: http://www.cnblogs.com/shadrach
# NOTE: original article; please indicate the source if reprinting.
# Thanks.
# Update: 2018/1/24
import urllib.request
import urllib.parse  # used to URL-encode the community name for the Amap request
from bs4 import BeautifulSoup
import xlsxwriter
import xlrd
import os
import math
import time
import glob

# coordinate conversion: from GCJ-02 (Amap) to WGS-84 (GPS)
def GCJ2WGS(location):
    # location is a "lon,lat" string in GCJ-02, e.g. "113.923745,22.530824"
    lon = float(location[0:location.find(",")])
    lat = float(location[location.find(",") + 1:len(location)])
    a = 6378245.0 # semi-major axis of the Krasovsky ellipsoid
    ee = 0.00669342162296594323 # first eccentricity squared of the Krasovsky ellipsoid
    PI = 3.14159265358979324 # pi
    # conversion formulas
    x = lon - 105.0
    y = lat - 35.0
    # longitude offset
    dLon = 300.0 + x + 2.0 * y + 0.1 * x * x + 0.1 * x * y + 0.1 * math.sqrt(abs(x))
    dLon += (20.0 * math.sin(6.0 * x * PI) + 20.0 * math.sin(2.0 * x * PI)) * 2.0 / 3.0
    dLon += (20.0 * math.sin(x * PI) + 40.0 * math.sin(x / 3.0 * PI)) * 2.0 / 3.0
    dLon += (150.0 * math.sin(x / 12.0 * PI) + 300.0 * math.sin(x / 30.0 * PI)) * 2.0 / 3.0
    # latitude offset
    dLat = -100.0 + 2.0 * x + 3.0 * y + 0.2 * y * y + 0.1 * x * y + 0.2 * math.sqrt(abs(x))
    dLat += (20.0 * math.sin(6.0 * x * PI) + 20.0 * math.sin(2.0 * x * PI)) * 2.0 / 3.0
    dLat += (20.0 * math.sin(y * PI) + 40.0 * math.sin(y / 3.0 * PI)) * 2.0 / 3.0
    dLat += (160.0 * math.sin(y / 12.0 * PI) + 320 * math.sin(y * PI / 30.0)) * 2.0 / 3.0
    radLat = lat / 180.0 * PI
    magic = math.sin(radLat)
    magic = 1 - ee * magic * magic
    sqrtMagic = math.sqrt(magic)
    dLat = (dLat * 180.0) / ((a * (1 - ee)) / (magic * sqrtMagic) * PI)
    dLon = (dLon * 180.0) / (a / sqrtMagic * math.cos(radLat) * PI)
    wgsLon = lon - dLon
    wgsLat = lat - dLat
    return wgsLon,wgsLat
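
# Usage note (illustrative): GCJ2WGS takes a "lon,lat" string in GCJ-02 (the Amap
# coordinate system) and returns a (wgs_lon, wgs_lat) tuple in WGS-84, e.g.
#   wgs_lon, wgs_lat = GCJ2WGS("113.923745,22.530824")
# The correction is typically on the order of a few hundred metres.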

# xlsx files merge
def xlsx_merge(fileLocation,header,filename):
    fileList = []
    for fileName in glob.glob(fileLocation + "*.xlsx"):
        fileList.append(fileName)
    fileNum = len(fileList)
    matrix = [None] * fileNum
    for i in range(fileNum):
        fileName = fileList[i]
        workBook = xlrd.open_workbook(fileName)
        try:
            sheet = workBook.sheet_by_index(0)
        except Exception as e:
            print(e)
            matrix[i] = [] # skip workbooks whose first sheet cannot be read
            continue
        nRows = sheet.nrows
        matrix[i] = [0]*(nRows - 1)
        nCols = sheet.ncols
        for m in range(nRows - 1):
            matrix[i][m] = ["0"]* nCols
        for j in range(1,nRows):
            for k in range(nCols):
                matrix[i][j-1][k] = sheet.cell(j,k).value
    mergedBook = xlsxwriter.Workbook(fileLocation + filename + ".xlsx")
    sheet = mergedBook.add_worksheet("merged")
    for i in range(len(header)):
        sheet.write(0,i,header[i])
    rowIndex = 1
    for fileIndex in range(fileNum):
        for j in range(len(matrix[fileIndex])):
            for colIndex in range (len(matrix[fileIndex][j])):
                sheet.write(rowIndex,colIndex,matrix[fileIndex][j][colIndex])
            rowIndex += 1
    print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + ": " + "merged %d files" % fileNum)
    mergedBook.close()
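
# Note: xlsx_merge assumes every .xlsx file under fileLocation has the same column
# layout as `header`; the first row of each file (its own header row) is skipped.
# The merged workbook is written into the same folder, so on a second run the previous
# merged file is itself picked up by glob and its rows are duplicated in the new output.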

# uniform request: fetch a URL with fixed headers and return the parsed page
def soup_form(url,referer):  
    headers = {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
#             "Accept-Encoding":"gzip, deflate, sdch, br",  # 解码错误,注释
            "Accept-Language":"zh-CN,zh;q=0.8",
            "Cache-Control":"max-age=0",
            "Connection":"keep-alive",
            "Cookie":"lps=http%3A%2F%2Fwww.anjuke.com%2F%3Fpi%3DPZ-baidu-pc-all-biaoti%7Chttp%3A%2F%2Fbzclk.baidu.com%2Fadrc.php%3Ft%3D06KL00c00fDgzw60mUFU00PpAs0Mhyup00000PkqW-b00000uN71Vj.THvs_oeHEtY0UWdBmy-bIfK15yNBnHfkrjfLnj0sn1bdmWD0IHYLfbcsnYuKwj-7f1KKfHT4nj0sPYRvwj0dPDFanYFKfsK95gTqFhdWpyfqn103nWfLP1ndniusThqbpyfqnHm0uHdCIZwsT1CEQLILIz49UhGdpvR8mvqVQ1qspHdfyBdBmy-bIidsmzd9UAsVmh-9ULwG0APzm1YkrH6dP0%26tpl%3Dtpl_10085_16624_12226%26l%3D1502510556%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253D%2525E5%2525AE%252589%2525E5%2525B1%252585%2525E5%2525AE%2525A2-%2525E5%25259B%2525BD%2525E5%252586%252585%2525E9%2525A2%252586%2525E5%252585%252588%2525E6%252589%2525BE%2525E6%252588%2525BF%2525E5%2525B9%2525B3%2525E5%25258F%2525B0%2525EF%2525BC%25258C%2525E5%2525AE%252589%2525E5%2525BF%252583%2525E6%25258C%252591%2526xp%253Did%28%252522m4ce5ae35%252522%29%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D54%26ie%3DUTF-8%26f%3D8%26tn%3Dbaidu%26wd%3D%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%26oq%3D%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%26rqlang%3Dcn; sessid=CE9A95AF-043B-90B5-A2E4-5F5D39B41EC4; als=0; ctid=41; ANJUKE_BUCKET=pc-home%3AErshou_Web_Home_Home-a; _ga=GA1.2.113488767.1516673325; _gid=GA1.2.255451285.1516673325; __xsptplusUT_8=1; __xsptplus8=8.2.1516678573.1516678593.4%232%7Cbzclk.baidu.com%7C%7C%7C%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%7C%23%23249u729XL4J3ZAGKQyEZUyuV4myBLtSZ%23; 58tj_uuid=8a65130f-1085-403a-9e02-5c07dba15641; new_session=0; init_refer=https%253A%252F%252Fnc.zu.anjuke.com%252F%253Ffrom%253Dnavigation; new_uv=2; aQQ_ajkguid=BC9AF129-431B-1C4F-BB91-A27203DE8341; twe=2; Hm_lvt_ed38609fc79dd16e428d5a06610cfeb9=1516673382; Hm_lpvt_ed38609fc79dd16e428d5a06610cfeb9=1516678594",
            "Referer":referer,
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
           }  
    request = urllib.request.Request(url = url, headers = headers)
    soup = BeautifulSoup(urllib.request.urlopen(request,timeout=60).read().decode("utf-8"),"lxml")
    return soup
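
# Usage note: soup_form(url, referer) fetches `url` with the headers above (sending
# `referer` as the Referer) and returns the parsed BeautifulSoup tree. The Cookie and
# User-Agent were captured from the author's browser session; as preface point 5 says,
# you will likely need to replace them with values from your own.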

header = ["名称","房型","面积","层数","中介人","小区","高德地址","高德坐标","wgs坐标","wgs经度","wgs纬度","路段","地址","特点一","特点二","特点三","价格","房源链接"]

# Step 1: get and save (or read back) the level-1 and level-2 links
links_file = "E:/20180123安居客_南昌租房/links/links.xlsx"
if os.path.exists(links_file):
    workbook_links = xlrd.open_workbook(links_file)
    sheet_links = workbook_links.sheet_by_index(0)
    level2_link = sheet_links.col_values(0)
    level2 = sheet_links.col_values(1)
else:
    # get sub_level1
    url_level_0 = "https://nc.zu.anjuke.com/fangyuan/p1/" # change this URL to match the city you want to scrape
    level1_link =[]
    level1 = []
    for a in soup_form(url_level_0,"https://nc.zu.anjuke.com/").find("div", class_ = "sub-items sub-level1").find_all("a"):
        level1_link.append(a.get("href"))
        level1.append(a.text)
    
    # get sub_level2
    level2_link =[]
    level2 = []
    for i in range(1,len(level1_link)): # start at 1 so that level1_link[i-1] can be sent as the Referer
        for a in soup_form(level1_link[i],level1_link[i-1]).find("div", class_ = "sub-items sub-level2").find_all("a"):
            if a.text == "全部":
                pass
            else:
                level2_link.append(a.get("href"))        
                level2.append(a.text)
    workbook_links = xlsxwriter.Workbook(links_file)
    sheet_links = workbook_links.add_worksheet("level_2")
    sheet_links.write_column(0, 0, level2_link)
    sheet_links.write_column(0, 1, level2)
    workbook_links.close()
print("Step 1 Done!\nStep 2 Start!")

# Step 2: scrape the rental listings behind every level-2 link
for j in range(len(level2_link)): # at every level 2 page
    page_index = 1
    for k in range(1,51): # hard cap of 50 pages per level-2 link
        # on each page, read the pagination bar to find the max page; stop once the current page goes past it
        rent_info_file ="E:/20180123安居客_南昌租房/split_data/" + level2[j] + "_info_page" + str(page_index) + ".xlsx" 
        if os.path.exists(rent_info_file):
            print(level2[j] + "_info_page" + str(page_index) + ".xlsx already exists, skipping")
            page_index += 1
        else:
            pages = []
            url = level2_link[j] + "p" + str(page_index)
            soup = soup_form(url, level2_link[j])
            try:
                for a in soup.find("div",class_ = "multi-page").find_all("a"):
                    if a.text == "下一页 >" or a.text == "上一页":
                        pass
                    else:
                        pages.append(int(a.text))
            except Exception:
                break
            try:
                max_page = pages[len(pages)-1]
            except Exception:
                max_page = 1
            if page_index < max_page + 2: # i.e. page_index <= max_page + 1: one page of slack beyond the highest page listed
                workbook_page = xlsxwriter.Workbook(rent_info_file)
                sheet = workbook_page.add_worksheet("page" + str(page_index))                
                for header_index in range(len(header)):
                    sheet.write(0,header_index,header[header_index])
                row_index = 1    
                for div in soup.find_all("div", class_ = "zu-itemmod"): # one div per listing
                    try:
                        sheet.write(row_index,0,div.find("a").get("title")) # listing title
                    except Exception:
                        pass 
                    
                    # NOTE: "\n" is an assumed delimiter between the layout/area/floor text and the
                    # agent name; adjust it to match the actual structure of the first <p> in each listing.
                    try:
                        sheet.write(row_index,1,div.find("p").text.split("\n")[0].split("|")[0].replace(" ","").replace("\n","")) # layout
                    except Exception:
                        pass
                    
                    try:
                        sheet.write(row_index,2,div.find("p").text.split("\n")[0].split("|")[1]) # area
                    except Exception:
                        pass
                    
                    try:
                        sheet.write(row_index,3,div.find("p").text.split("\n")[0].split("|")[2]) # floor
                    except Exception:
                        pass 
                    
                    try:
                        sheet.write(row_index,4,div.find("p").text.split("\n")[1]) # agent
                    except Exception:
                        pass
                    
                    try:
                        xiaoqu = div.find("address").text.split()[0]
                        sheet.write(row_index,5,xiaoqu) # community name
                        # geocode the community name with the Amap REST API (0791 = Nanchang; see preface point 6 for the key)
                        url_amap = "http://restapi.amap.com/v3/geocode/geo?address=" + urllib.parse.quote(xiaoqu) + "&output=xml&city=0791&key=【你的key】"
                        soup_amap = BeautifulSoup(urllib.request.urlopen(url_amap).read(),"xml")
                        sheet.write(row_index,6,soup_amap.find("formatted_address").get_text()) # Amap formatted address
                        location_amap = soup_amap.find("location").get_text()
                        sheet.write(row_index,7,location_amap) # Amap (GCJ-02) coordinates
                        location_wgs = GCJ2WGS(location_amap)
                        longitude = location_wgs[0]
                        latitude = location_wgs[1]
                        sheet.write(row_index,8,str(longitude) + "," + str(latitude)) # WGS-84 coordinates
                        sheet.write(row_index,9,longitude) # WGS-84 longitude
                        sheet.write(row_index,10,latitude) # WGS-84 latitude
                    except Exception:
                        pass

                    try:
                        sheet.write(row_index,11,div.find("address").text.split()[1]) # road section
                    except Exception:
                        pass

                    try:
                        sheet.write(row_index,12,div.find("address").text.split()[2]) # address
                    except Exception:
                        pass    
                    try:
                        sheet.write(row_index,13,div.find("span",class_ = "cls-1").text) # feature 1
                    except Exception:
                        pass
                    
                    try:
                        sheet.write(row_index,14,div.find("span",class_ = "cls-2").text) # feature 2
                    except Exception:
                        pass
                    
                    try:
                        sheet.write(row_index,15,div.find("span",class_ = "cls-3").text) # feature 3
                    except Exception:
                        pass
                    
                    try:
                        sheet.write(row_index,16,div.find("strong").text) # price
                    except Exception:
                        pass
                    
                    try:
                        sheet.write(row_index,17,div.find("a").get("href")) # listing URL
                    except Exception:
                        pass
                    row_index += 1
                workbook_page.close()
                print(level2[j] + " page" + str(page_index) + " finished")
                page_index += 1
            else:
                break
    print(level2[j] + " finished")
print("Step 2 Done!\nStep 3 Start!")

# Step 3: merge all xlsx files
xlsx_merge("E:/20180123安居客_南昌租房/split_data/", header, "nanchang_rent_info") 
   
print("All work done")

 

Reprinted from: https://www.cnblogs.com/shadrach/p/8342505.html
