测试base64加密:http://tool.chinaz.com/Tools/Base64.aspx
基本原理:https://www.cnblogs.com/hongru/archive/2012/01/14/2321397.html
一、分析url
访问网址:https://bj.zu.anjuke.com/
发现字体部分是加密得到的,可以猜想到大概是css加密,尝试查看它的字体。
去style中找下这个字体的来源
查看自定义字体的格式,如下所示:
@font-face {
font-family: "PingFangSC-Regular-address";
src: url("//s3plus.meituan.net/v1/mss_73a511b8f91f43d0bdae92584ea6330b/font/5a43c7ad.eot");
src: url("//s3plus.meituan.net/v1/mss_73a511b8f91f43d0bdae92584ea6330b/font/5a43c7ad.eot?#iefix") format("embedded-opentype"),url("//s3plus.meituan.net/v1/mss_73a511b8f91f43d0bdae92584ea6330b/font/5a43c7ad.woff");
}
.address {
font-family: 'PingFangSC-Regular-address';
}
发现,src:url(“字体的地址”),其实base64也可以将数据加密,直接使用"data:加密后的数据",这里的style分析发现,“data:application/font-ttf;charset=utf-8;base64,使用base64加密的数据”,这里可以通过正则找到数据。
@font-face{font-family:'fangchan-secret';src:url('data:application/font-ttf;charset=utf-8;base64,AAEAAAALAIAAAwAwR1NVQiCLJXoAAAE4AAAAVE9TLzL4XQjtAAABjAAAAFZjbWFwq8J/ZQAAAhAAAAIuZ2x5ZuWIN0cAAARYAAADdGhlYWQa3IzLAAAA4AAAADZoaGVhCtADIwAAALwAAAAkaG10eC7qAAAAAAHkAAAALGxvY2ED7gSyAAAEQAAAABhtYXhwARgANgAAARgAAAAgbmFtZTd6VP8AAAfMAAACanBvc3QEQwahAAAKOAAAAEUAAQAABmb+ZgAABLEAAAAABGgAAQAAAAAAAAAAAAAAAAAAAAsAAQAAAAEAAN8qfwZfDzz1AAsIAAAAAADbqqDdAAAAANuqoN0AAP/mBGgGLgAAAAgAAgAAAAAAAAABAAAACwAqAAMAAAAAAAIAAAAKAAoAAAD/AAAAAAAAAAEAAAAKADAAPgACREZMVAAObGF0bgAaAAQAAAAAAAAAAQAAAAQAAAAAAAAAAQAAAAFsaWdhAAgAAAABAAAAAQAEAAQAAAABAAgAAQAGAAAAAQAAAAEERAGQAAUAAAUTBZkAAAEeBRMFmQAAA9cAZAIQAAACAAUDAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFBmRWQAQJR2n6UGZv5mALgGZgGaAAAAAQAAAAAAAAAAAAAEsQAABLEAAASxAAAEsQAABLEAAASxAAAEsQAABLEAAASxAAAEsQAAAAAABQAAAAMAAAAsAAAABAAAAaYAAQAAAAAAoAADAAEAAAAsAAMACgAAAaYABAB0AAAAFAAQAAMABJR2lY+ZPJpLnjqeo59kn5Kfpf//AACUdpWPmTyaS546nqOfZJ+Sn6T//wAAAAAAAAAAAAAAAAAAAAAAAAABABQAFAAUABQAFAAUABQAFAAUAAAACQAIAAUAAgAHAAQABgABAAMACgAAAQYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADAAAAAAAiAAAAAAAAAAKAACUdgAAlHYAAAAJAACVjwAAlY8AAAAIAACZPAAAmTwAAAAFAACaSwAAmksAAAACAACeOgAAnjoAAAAHAACeowAAnqMAAAAEAACfZAAAn2QAAAAGAACfkgAAn5IAAAABAACfpAAAn6QAAAADAACfpQAAn6UAAAAKAAAAAAAAACgAPgBmAJoAvgDoASQBOAF+AboAAgAA/+YEWQYnAAoAEgAAExAAISAREAAjIgATECEgERAhIFsBEAECAez+6/rs/v3IATkBNP7S/sEC6AGaAaX85v54/mEBigGB/ZcCcwKJAAABAAAAAAQ1Bi4ACQAAKQE1IREFNSURIQQ1/IgBW/6cAicBWqkEmGe0oPp7AAEAAAAABCYGJwAXAAApATUBPgE1NCYjIgc1NjMyFhUUAgcBFSEEGPxSAcK6fpSMz7y389Hym9j+nwLGqgHButl0hI2wx43iv5D+69b+pwQAAQAA/+YEGQYnACEAABMWMzI2NRAhIzUzIBE0ISIHNTYzMhYVEAUVHgEVFAAjIiePn8igu/5bgXsBdf7jo5CYy8bw/sqow/7T+tyHAQN7nYQBJqIBFP9uuVjPpf7QVwQSyZbR/wBSAAACAAAAAARoBg0ACgASAAABIxEjESE1ATMRMyERNDcjBgcBBGjGvv0uAq3jxv58BAQOLf4zAZL+bgGSfwP8/CACiUVaJlH9TwABAAD/5gQhBg0AGAAANxYzMjYQJiMiBxEhFSERNjMyBBUUACEiJ7GcqaDEx71bmgL6/bxXLPUBEv7a/v3Zbu5mswEppA4DE63+SgX42uH+6kAAAAACAAD/5gRbBicAFgAiAAABJiMiAgMzNjMyEhUUACMiABEQACEyFwEUFjMyNjU0JiMiBgP6eYTJ9AIFbvHJ8P7r1+z+8wFhASClXv1Qo4eAoJeLhKQFRj7+ov7R1f762eP+3AFxAVMBmgHjLfwBmdq8lKCytAAAAAABAAAAAARNBg0ABgAACQEjASE1IQRN/aLLAkD8+gPvBcn6NwVgrQAAAwAA/+YESgYnABUAHwApAAABJDU0JDMyFhUQBRUEERQEIyIkNRAlATQmIyIGFRQXNgEEFRQWMzI2NTQBtv7rAQTKufD+3wFT/un6zf7+AUwBnIJvaJLz+P78/uGoh4OkAy+B9avXyqD+/osEev7aweXitAEohwF7aHh9YcJlZ/7qdNhwkI9r4QAAAAACAAD/5gRGBicAFwAjAAA3FjMyEhEGJwYjIgA1NAAzMgAREAAhIicTFBYzMjY1NCYjIga5gJTQ5QICZvHD/wABGN/nAQT+sP7Xo3FxoI16pqWHfaTSSgFIAS4CAsIBDNbkASX+lf6l/lP+MjUEHJy3p3en274AAAAAABAAxgABAAAAAAABAA8AAAABAAAAAAACAAcADwABAAAAAAADAA8AFgABAAAAAAAEAA8AJQABAAAAAAAFAAsANAABAAAAAAAGAA8APwABAAAAAAAKACsATgABAAAAAAALABMAeQADAAEECQABAB4AjAADAAEECQACAA4AqgADAAEECQADAB4AuAADAAEECQAEAB4A1gADAAEECQAFABYA9AADAAEECQAGAB4BCgADAAEECQAKAFYBKAADAAEECQALACYBfmZhbmdjaGFuLXNlY3JldFJlZ3VsYXJmYW5nY2hhbi1zZWNyZXRmYW5nY2hhbi1zZWNyZXRWZXJzaW9uIDEuMGZhbmdjaGFuLXNlY3JldEdlbmVyYXRlZCBieSBzdmcydHRmIGZyb20gRm9udGVsbG8gcHJvamVjdC5odHRwOi8vZm9udGVsbG8uY29tAGYAYQBuAGcAYwBoAGEAbgAtAHMAZQBjAHIAZQB0AFIAZQBnAHUAbABhAHIAZgBhAG4AZwBjAGgAYQBuAC0AcwBlAGMAcgBlAHQAZgBhAG4AZwBjAGgAYQBuAC0AcwBlAGMAcgBlAHQAVgBlAHIAcwBpAG8AbgAgADEALgAwAGYAYQBuAGcAYwBoAGEAbgAtAHMAZQBjAHIAZQB0AEcAZQBuAGUAcgBhAHQAZQBkACAAYgB5ACAAcwB2AGcAMgB0AHQAZgAgAGYAcgBvAG0AIABGAG8AbgB0AGUAbABsAG8AIABwAHIAbwBqAGUAYwB0AC4AaAB0AHQAcAA6AC8ALwBmAG8AbgB0AGUAbABsAG8ALgBjAG8AbQAAAAIAAAAAAAD/EwB3AAAAAAAAAAAAAAAAAAAAAAAAAAAACwECAQMBBAEFAQYBBwEIAQkBCgELAQwAAAAAAAAAAAAAAAAAAAAA') format('truetype')}.strongbox{font-family:'fangchan-secret','Hiragino Sans GB','Microsoft yahei',Arial,sans-serif,'宋体'!important}
接下来发送请求,获取数据,提取base64数据
import requests
url = "https://bj.zu.anjuke.com/"
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}
response = requests.get(url,headers=headers)
html = response.content.decode("utf-8")
print(html)
发现style中的字体是通过js来写的,这个不影响正则的提取,提取之后,使用base64解密,然后保存成ttf文件,使用fontcreator打开查看:
import requests
import re
import base64
url = "https://bj.zu.anjuke.com/"
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}
response = requests.get(url,headers=headers)
html = response.content.decode("utf-8")
data1 = re.findall(r"base64,(.*?)'\)",html,re.S)[0]
print(data1)
data2 = base64.b64decode(data1)
print(data2)
with open("./anjuke.ttf","wb") as file:
file.write(data2)
再运行一次,再查看,对比:
发现上面的编号每次是不同的,当然内容是一样的,11个内容。
接着,使用fonttools工具读取ttf,获取编号和对应信息。
import requests
import re
import base64
from io import BytesIO
from fontTools.ttLib import TTFont
url = "https://bj.zu.anjuke.com/"
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}
response = requests.get(url,headers=headers)
html = response.content.decode("utf-8")
data1 = re.findall(r"base64,(.*?)'\)",html,re.S)[0]
#base64解密
data2 = base64.b64decode(data1)
# with open("./anjuke.ttf","wb") as file:
# file.write(data2)
#字节读取
data3 = BytesIO(data2)
#读取字体
font = TTFont(data3)
#打印字体和对应
print(font.getGlyphOrder())
print(font.getBestCmap())
['glyph00000', 'glyph00001', 'glyph00002', 'glyph00003', 'glyph00004', 'glyph00005', 'glyph00006', 'glyph00007', 'glyph00008', 'glyph00009', 'glyph00010']
{38006: 'glyph00005', 38287: 'glyph00003', 39228: 'glyph00006', 39499: 'glyph00009', 40506: 'glyph00008', 40611: 'glyph00001', 40804: 'glyph00010', 40850: 'glyph00004', 40868: 'glyph00002', 40869: 'glyph00007'}
发现规律:
'glyph00001‘对应的是数字0,'glyph00002’对应数字1…
38006是10进制,而使用ttf文件中上面的键是uni+16进制,这里将16和10进制进行转换就可以了。
二、代码实现
思路:
(1)向https://bj.zu.anjuke.com/发送请求获取html数据
(2)提取base64加密后的数据,base64解码
(3)使用fonttool读取字体
从html数据中获取加密的数据,在自定义字体中获取原文字
import requests
import re
import base64
import csv
from io import BytesIO
from fontTools.ttLib import TTFont
from lxml import etree
class AnJuKeSpider:
def __init__(self, url):
self.url = url
self.headers = {
"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}
self.html = ""
self.font_dict = {}
def parse_url(self, url, headers, params={}):
"""解析url,返回html"""
response = requests.get(url, headers=headers, params=params)
return response.content.decode("utf-8")
def parse_xpath(self, html):
"""使用xpath解析html,返回xpath对象"""
etree_obj = etree.HTML(html)
return etree_obj
def get_font_dict(self, html):
"""获取字典 {编号:文字}"""
# 正则提取
data1 = re.findall(r"base64,(.*?)'\)", html, re.S)[0]
# base64解密
data2 = base64.b64decode(data1)
# 字节读取
data3 = BytesIO(data2)
# 读取字体
font = TTFont(data3)
# 打印字体和对应
data4 = font.getBestCmap()
# 返回数据
return {hex(k)[2:]: str(int(v[5:].lstrip("0")) - 1) for k, v in data4.items()}
def parse_font(self, string):
"""获取对应的字体"""
return re.sub(r'(\*[a-z0-9]+?\*)',lambda x:self.font_dict[x.group(1).strip("*")],string)
def start(self):
"""主程序"""
self.html = self.parse_url(url=self.url,headers=self.headers)
self.font_dict = self.get_font_dict(html=self.html)
# 替换特殊字符,避免产生乱码一样的内容
self.html = re.sub(r"&#x(\w+?);", r"*\1*", self.html)
#使用xpath解析
xpath_obj = self.parse_xpath(html=self.html)
div_list = xpath_obj.xpath('//div[@class="zu-itemmod"]')
for div in div_list:
item = {}
item["title"] = self.parse_font(div.xpath("./div[1]/h3/a/b/text()")[0])
item["price"] = self.parse_font(div.xpath("./div[2]/p/strong/b/text()")[0])
self.save(item)
def save(self,item):
"""将数据保存到csv中"""
print("{}保存中...".format(item))
with open("./安居客.csv", "a", encoding="utf-8") as file:
writer = csv.writer(file)
writer.writerow(item.values())
if __name__ == '__main__':
url = "https://bj.zu.anjuke.com/"
AnJuKeSpider(url=url).start()