Python爬取安居客(base64加密)

测试base64加密:http://tool.chinaz.com/Tools/Base64.aspx

基本原理:https://www.cnblogs.com/hongru/archive/2012/01/14/2321397.html

一、分析url

访问网址:https://bj.zu.anjuke.com/
在这里插入图片描述
发现页面中的数字部分显示为乱码,可以猜想网站大概是使用了CSS自定义字体来做混淆(反爬),尝试查看它使用的字体。
在这里插入图片描述
去style中找下这个字体的来源
在这里插入图片描述
查看自定义字体的格式,如下所示:

/* Declares a custom font family whose glyph data is loaded from remote
   font files (.eot for embedded-opentype, plus a .woff file). */
@font-face {
    font-family: "PingFangSC-Regular-address";
    src: url("//s3plus.meituan.net/v1/mss_73a511b8f91f43d0bdae92584ea6330b/font/5a43c7ad.eot");
    src: url("//s3plus.meituan.net/v1/mss_73a511b8f91f43d0bdae92584ea6330b/font/5a43c7ad.eot?#iefix") format("embedded-opentype"),url("//s3plus.meituan.net/v1/mss_73a511b8f91f43d0bdae92584ea6330b/font/5a43c7ad.woff");
}

/* Elements with class "address" are rendered with the custom font above. */
.address {
    font-family: 'PingFangSC-Regular-address';
}

可以发现,src: url("字体的地址") 中一般填写字体文件的地址;但字体数据也可以先用base64编码(严格来说base64是编码而不是加密),再以 "data:编码后的数据" 的形式直接内嵌在样式里。分析这里的style发现,它的格式是 "data:application/font-ttf;charset=utf-8;base64,使用base64编码的字体数据",因此可以通过正则把这段数据提取出来。

@font-face{font-family:'fangchan-secret';src:url('data:application/font-ttf;charset=utf-8;base64,AAEAAAALAIAAAwAwR1NVQiCLJXoAAAE4AAAAVE9TLzL4XQjtAAABjAAAAFZjbWFwq8J/ZQAAAhAAAAIuZ2x5ZuWIN0cAAARYAAADdGhlYWQa3IzLAAAA4AAAADZoaGVhCtADIwAAALwAAAAkaG10eC7qAAAAAAHkAAAALGxvY2ED7gSyAAAEQAAAABhtYXhwARgANgAAARgAAAAgbmFtZTd6VP8AAAfMAAACanBvc3QEQwahAAAKOAAAAEUAAQAABmb+ZgAABLEAAAAABGgAAQAAAAAAAAAAAAAAAAAAAAsAAQAAAAEAAN8qfwZfDzz1AAsIAAAAAADbqqDdAAAAANuqoN0AAP/mBGgGLgAAAAgAAgAAAAAAAAABAAAACwAqAAMAAAAAAAIAAAAKAAoAAAD/AAAAAAAAAAEAAAAKADAAPgACREZMVAAObGF0bgAaAAQAAAAAAAAAAQAAAAQAAAAAAAAAAQAAAAFsaWdhAAgAAAABAAAAAQAEAAQAAAABAAgAAQAGAAAAAQAAAAEERAGQAAUAAAUTBZkAAAEeBRMFmQAAA9cAZAIQAAACAAUDAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFBmRWQAQJR2n6UGZv5mALgGZgGaAAAAAQAAAAAAAAAAAAAEsQAABLEAAASxAAAEsQAABLEAAASxAAAEsQAABLEAAASxAAAEsQAAAAAABQAAAAMAAAAsAAAABAAAAaYAAQAAAAAAoAADAAEAAAAsAAMACgAAAaYABAB0AAAAFAAQAAMABJR2lY+ZPJpLnjqeo59kn5Kfpf//AACUdpWPmTyaS546nqOfZJ+Sn6T//wAAAAAAAAAAAAAAAAAAAAAAAAABABQAFAAUABQAFAAUABQAFAAUAAAACQAIAAUAAgAHAAQABgABAAMACgAAAQYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADAAAAAAAiAAAAAAAAAAKAACUdgAAlHYAAAAJAACVjwAAlY8AAAAIAACZPAAAmTwAAAAFAACaSwAAmksAAAACAACeOgAAnjoAAAAHAACeowAAnqMAAAAEAACfZAAAn2QAAAAGAACfkgAAn5IAAAABAACfpAAAn6QAAAADAACfpQAAn6UAAAAKAAAAAAAAACgAPgBmAJoAvgDoASQBOAF+AboAAgAA/+YEWQYnAAoAEgAAExAAISAREAAjIgATECEgERAhIFsBEAECAez+6/rs/v3IATkBNP7S/sEC6AGaAaX85v54/mEBigGB/ZcCcwKJAAABAAAAAAQ1Bi4ACQAAKQE1IREFNSURIQQ1/IgBW/6cAicBWqkEmGe0oPp7AAEAAAAABCYGJwAXAAApATUBPgE1NCYjIgc1NjMyFhUUAgcBFSEEGPxSAcK6fpSMz7y389Hym9j+nwLGqgHButl0hI2wx43iv5D+69b+pwQAAQAA/+YEGQYnACEAABMWMzI2NRAhIzUzIBE0ISIHNTYzMhYVEAUVHgEVFAAjIiePn8igu/5bgXsBdf7jo5CYy8bw/sqow/7T+tyHAQN7nYQBJqIBFP9uuVjPpf7QVwQSyZbR/wBSAAACAAAAAAR
oBg0ACgASAAABIxEjESE1ATMRMyERNDcjBgcBBGjGvv0uAq3jxv58BAQOLf4zAZL+bgGSfwP8/CACiUVaJlH9TwABAAD/5gQhBg0AGAAANxYzMjYQJiMiBxEhFSERNjMyBBUUACEiJ7GcqaDEx71bmgL6/bxXLPUBEv7a/v3Zbu5mswEppA4DE63+SgX42uH+6kAAAAACAAD/5gRbBicAFgAiAAABJiMiAgMzNjMyEhUUACMiABEQACEyFwEUFjMyNjU0JiMiBgP6eYTJ9AIFbvHJ8P7r1+z+8wFhASClXv1Qo4eAoJeLhKQFRj7+ov7R1f762eP+3AFxAVMBmgHjLfwBmdq8lKCytAAAAAABAAAAAARNBg0ABgAACQEjASE1IQRN/aLLAkD8+gPvBcn6NwVgrQAAAwAA/+YESgYnABUAHwApAAABJDU0JDMyFhUQBRUEERQEIyIkNRAlATQmIyIGFRQXNgEEFRQWMzI2NTQBtv7rAQTKufD+3wFT/un6zf7+AUwBnIJvaJLz+P78/uGoh4OkAy+B9avXyqD+/osEev7aweXitAEohwF7aHh9YcJlZ/7qdNhwkI9r4QAAAAACAAD/5gRGBicAFwAjAAA3FjMyEhEGJwYjIgA1NAAzMgAREAAhIicTFBYzMjY1NCYjIga5gJTQ5QICZvHD/wABGN/nAQT+sP7Xo3FxoI16pqWHfaTSSgFIAS4CAsIBDNbkASX+lf6l/lP+MjUEHJy3p3en274AAAAAABAAxgABAAAAAAABAA8AAAABAAAAAAACAAcADwABAAAAAAADAA8AFgABAAAAAAAEAA8AJQABAAAAAAAFAAsANAABAAAAAAAGAA8APwABAAAAAAAKACsATgABAAAAAAALABMAeQADAAEECQABAB4AjAADAAEECQACAA4AqgADAAEECQADAB4AuAADAAEECQAEAB4A1gADAAEECQAFABYA9AADAAEECQAGAB4BCgADAAEECQAKAFYBKAADAAEECQALACYBfmZhbmdjaGFuLXNlY3JldFJlZ3VsYXJmYW5nY2hhbi1zZWNyZXRmYW5nY2hhbi1zZWNyZXRWZXJzaW9uIDEuMGZhbmdjaGFuLXNlY3JldEdlbmVyYXRlZCBieSBzdmcydHRmIGZyb20gRm9udGVsbG8gcHJvamVjdC5odHRwOi8vZm9udGVsbG8uY29tAGYAYQBuAGcAYwBoAGEAbgAtAHMAZQBjAHIAZQB0AFIAZQBnAHUAbABhAHIAZgBhAG4AZwBjAGgAYQBuAC0AcwBlAGMAcgBlAHQAZgBhAG4AZwBjAGgAYQBuAC0AcwBlAGMAcgBlAHQAVgBlAHIAcwBpAG8AbgAgADEALgAwAGYAYQBuAGcAYwBoAGEAbgAtAHMAZQBjAHIAZQB0AEcAZQBuAGUAcgBhAHQAZQBkACAAYgB5ACAAcwB2AGcAMgB0AHQAZgAgAGYAcgBvAG0AIABGAG8AbgB0AGUAbABsAG8AIABwAHIAbwBqAGUAYwB0AC4AaAB0AHQAcAA6AC8ALwBmAG8AbgB0AGUAbABsAG8ALgBjAG8AbQAAAAIAAAAAAAD/EwB3AAAAAAAAAAAAAAAAAAAAAAAAAAAACwECAQMBBAEFAQYBBwEIAQkBCgELAQwAAAAAAAAAAAAAAAAAAAAA') format('truetype')}.strongbox{font-family:'fangchan-secret','Hiragino Sans GB','Microsoft yahei',Arial,sans-serif,'宋体'!important}

接下来发送请求,获取数据,提取base64数据

import requests

# Address of the Anjuke Beijing rental listing page.
page_url = "https://bj.zu.anjuke.com/"
# Send a desktop Chrome user-agent string with the request.
request_headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}

# Download the page, decode the raw bytes as UTF-8 and dump the markup
# so the embedded font declaration can be inspected by eye.
raw_page = requests.get(page_url, headers=request_headers).content
print(raw_page.decode("utf-8"))

在这里插入图片描述
发现style中的字体是通过js动态写入的,但这并不影响用正则提取。提取之后,先用base64解码,再保存成ttf文件,然后使用FontCreator打开查看:

import requests
import re
import base64

# Address of the Anjuke Beijing rental listing page.
page_url = "https://bj.zu.anjuke.com/"
# Send a desktop Chrome user-agent string with the request.
request_headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}

page = requests.get(page_url, headers=request_headers)
page_text = page.content.decode("utf-8")

# The font payload is everything between "base64," and the closing "')"
# of the CSS url(...); re.S lets the match run across line breaks.
font_matches = re.findall(r"base64,(.*?)'\)", page_text, re.S)
encoded_font = font_matches[0]
print(encoded_font)

# Turn the Base64 text back into the raw font bytes.
font_bytes = base64.b64decode(encoded_font)
print(font_bytes)

# Write the bytes out as a .ttf file so it can be opened in a font editor.
with open("./anjuke.ttf", "wb") as ttf_file:
    ttf_file.write(font_bytes)

在这里插入图片描述
再运行一次,再查看,对比:
在这里插入图片描述
发现每次请求得到的字符编号(编码)都是不同的,但字形内容是一样的,一共11个字形。
接着,使用fontTools工具读取ttf,获取编号和对应的字形信息。

import requests
import re
import base64
from io import BytesIO
from fontTools.ttLib import TTFont

# Address of the Anjuke Beijing rental listing page.
page_url = "https://bj.zu.anjuke.com/"
# Send a desktop Chrome user-agent string with the request.
request_headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}

page = requests.get(page_url, headers=request_headers)
page_text = page.content.decode("utf-8")

# The font payload is everything between "base64," and the closing "')"
# of the CSS url(...); re.S lets the match run across line breaks.
font_matches = re.findall(r"base64,(.*?)'\)", page_text, re.S)
encoded_font = font_matches[0]

# Decode the Base64 text into the raw font bytes.
font_bytes = base64.b64decode(encoded_font)

# Optional: dump the font to disk to inspect it in a font editor.
# with open("./anjuke.ttf","wb") as file:
#     file.write(font_bytes)

# Wrap the bytes in an in-memory file object and hand it to fontTools,
# so the font never has to be written to disk.
font_stream = BytesIO(font_bytes)
font = TTFont(font_stream)

# Show the glyph names and the code point -> glyph name mapping.
print(font.getGlyphOrder())
print(font.getBestCmap())
['glyph00000', 'glyph00001', 'glyph00002', 'glyph00003', 'glyph00004', 'glyph00005', 'glyph00006', 'glyph00007', 'glyph00008', 'glyph00009', 'glyph00010']
{38006: 'glyph00005', 38287: 'glyph00003', 39228: 'glyph00006', 39499: 'glyph00009', 40506: 'glyph00008', 40611: 'glyph00001', 40804: 'glyph00010', 40850: 'glyph00004', 40868: 'glyph00002', 40869: 'glyph00007'}

发现规律:
'glyph00001'对应的是数字0,'glyph00002'对应数字1……也就是说,字形编号减1就是真实的数字。
cmap中的键(如38006)是10进制,而ttf文件中显示的编号是"uni+16进制"的形式(38006即uni9476),所以这里把10进制转换成16进制就可以对应上了。
在这里插入图片描述

二、代码实现

思路:

(1)向https://bj.zu.anjuke.com/发送请求获取html数据
(2)提取base64加密后的数据,base64解码
(3)使用fontTools读取字体,得到"编号:数字"的对应关系

(4)从html数据中提取被自定义字体混淆的字符,再根据字体中的对应关系还原出原文字

import requests
import re
import base64
import csv
from io import BytesIO
from fontTools.ttLib import TTFont
from lxml import etree


class AnJuKeSpider:
    """Scrape Anjuke rental listings and undo the custom-font digit obfuscation.

    The page embeds a Base64-encoded TrueType font and writes digits as
    ``&#x....;`` entities that only that font renders correctly. The spider
    reads the font, builds a ``{hex code point: digit}`` table and uses it to
    restore the digits in each listing's title and price.
    """

    def __init__(self, url):
        """Remember the page to crawl and prepare the request headers.

        Args:
            url: Address of the listing page.
        """
        self.url = url
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        }
        self.html = ""
        self.font_dict = {}

    def parse_url(self, url, headers, params=None):
        """Fetch ``url`` and return the response body decoded as UTF-8.

        Args:
            url: Address to request.
            headers: HTTP headers sent with the request.
            params: Optional query-string parameters; ``None`` means none.

        Returns:
            The response body as a ``str``.
        """
        # A fresh dict per call avoids sharing one mutable default object.
        response = requests.get(url, headers=headers, params=params or {})
        return response.content.decode("utf-8")

    def parse_xpath(self, html):
        """Parse ``html`` with lxml and return the element tree for XPath queries."""
        etree_obj = etree.HTML(html)
        return etree_obj

    def get_font_dict(self, html):
        """Build the ``{hex code point: digit}`` table from the embedded font.

        Args:
            html: Page markup containing the ``base64,...')`` font payload.

        Returns:
            Dict mapping lowercase hex code points (without ``0x``) to the
            digit string they stand for.

        Raises:
            IndexError: If no embedded Base64 font is found in ``html``.
        """
        # The payload is everything between "base64," and the closing "')".
        data1 = re.findall(r"base64,(.*?)'\)", html, re.S)[0]
        # Decode to raw bytes and load them from memory, no temp file needed.
        font = TTFont(BytesIO(base64.b64decode(data1)))
        return self._cmap_to_digits(font.getBestCmap() or {})

    @staticmethod
    def _cmap_to_digits(cmap):
        """Convert a ``{code point: 'glyphNNNNN'}`` cmap into ``{hex: digit}``."""
        digits = {}
        for code_point, glyph_name in cmap.items():
            match = re.fullmatch(r"glyph(\d+)", glyph_name)
            if match is None:
                # Not one of the numbered glyphs; it carries no digit.
                continue
            glyph_index = int(match.group(1))
            if glyph_index == 0:
                # Glyph 0 has no digit (glyph 1 is already "0").
                continue
            # Glyph N draws the digit N - 1.
            digits[format(code_point, "x")] = str(glyph_index - 1)
        return digits

    def parse_font(self, string):
        """Replace every ``*hex*`` marker in ``string`` with its real digit.

        Markers whose code is not in ``self.font_dict`` are left untouched,
        so ordinary text that happens to contain ``*...*`` does not raise.

        Args:
            string: Text extracted from the marked-up page.

        Returns:
            The text with all known markers replaced by digits.
        """
        def restore(match):
            marker = match.group(1)
            return self.font_dict.get(marker.strip("*"), marker)

        return re.sub(r'(\*[a-z0-9]+?\*)', restore, string)

    def start(self):
        """Run the crawl: fetch the page, decode the listings and save them."""
        self.html = self.parse_url(url=self.url, headers=self.headers)
        self.font_dict = self.get_font_dict(html=self.html)

        def mark(match):
            # Only entities drawn by the custom font become "*hex*" markers;
            # every other entity is left for the HTML parser to decode.
            code = match.group(1).lower()
            if code in self.font_dict:
                return "*{}*".format(code)
            return match.group(0)

        # Marking keeps the hex code visible as plain text; otherwise the
        # parser would turn the entity into an unrelated character.
        self.html = re.sub(r"&#x(\w+?);", mark, self.html)
        xpath_obj = self.parse_xpath(html=self.html)
        for div in xpath_obj.xpath('//div[@class="zu-itemmod"]'):
            titles = div.xpath("./div[1]/h3/a/b/text()")
            prices = div.xpath("./div[2]/p/strong/b/text()")
            if not titles or not prices:
                # Skip entries lacking a title or price instead of aborting.
                continue
            item = {
                "title": self.parse_font(titles[0]),
                "price": self.parse_font(prices[0]),
            }
            self.save(item)

    def save(self, item):
        """Append one listing to the CSV file as a row.

        Args:
            item: Dict of field values; the values are written in order.
        """
        print("{}保存中...".format(item))
        # newline="" is what the csv module expects; without it Windows
        # output gets an empty line after every row.
        with open("./安居客.csv", "a", encoding="utf-8", newline="") as file:
            csv.writer(file).writerow(item.values())

# Script entry point: crawl the Anjuke Beijing rental listing page.
if __name__ == '__main__':
    spider = AnJuKeSpider(url="https://bj.zu.anjuke.com/")
    spider.start()

三、运行结果

在这里插入图片描述

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值