1. 找到反爬内容确定字体存放位置
定位到正则匹配
import re
from fontTools.ttLib import TTFont
import requests
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39"
}
url= 'https://www.maoyan.com/board/1?timeStamp=1652605412237&channelId=40011&index=8&signKey=bace7e65a24e717ead8cf5d252c0a1ac&sVersion=1&webdriver=false'
r= requests.get(url=url,headers=headers).text
with open('猫眼榜单.html','w',encoding='utf8') as f:
f.write(r)
#先把html下载下来便于分析
with open('猫眼榜单.html','r',encoding='utf8') as f:
html = f.read()
#匹配字体文件
woff_url=re.findall('@font-face.*?format.*?url\("(.*?)"\);}',html)[0]
response = requests.get(url = 'http:'+woff_url).content
with open('猫眼榜单.woff','wb') as f:
#下载
f.write(response)
2. 把woff文件保存为xml进行分析字体文件,找到规律
font = TTFont('猫眼榜单.woff')
如果借助xml找不到规律的话,打开woff文件(主要找到字体匹配规则
发现woff字体顺序和cmap对应的顺序是对应的
3. 拼凑数据
num = [9,0,7,3,2,4,6,8,1,5]
cmap = font['cmap'].getBestCmap()
print(cmap)
dict={}
for key,value in cmap.items():
key=hex(key).replace('0x',"&#x")
dict[key]=value
print(dict)
i=0
for key in dict.keys():
if key=='x':
continue
else:
dict[key]=num[i]
i+=1
print(dict)
#得到:
'''{120: 'x', 57628: 'uniE11C', 57708: 'uniE16C', 58724: 'uniE564', 58770: 'uniE592', 59076: 'uniE6C4', 61163: 'uniEEEB', 61546: 'uniF06A', 61913: 'uniF1D9', 61946: 'uniF1FA', 62555: 'uniF45B'}
{'x': 'x', '': 'uniE11C', '': 'uniE16C', '': 'uniE564', '': 'uniE592', '': 'uniE6C4', '': 'uniEEEB', '': 'uniF06A', '': 'uniF1D9', '': 'uniF1FA', '': 'uniF45B'}
{'x': 'x', '': 4, '': 8, '': 2, '': 9, '': 1, '': 3, '': 0, '': 5, '': 6, '': 7}'''
4. 最后替换真实的数据
for key,value in dict.items():
html=html.replace(key+';',str(value))
with open('猫眼榜单解密.html','w',encoding='utf-8')as f:
f.write(html)
这里其实有点取巧,当字体顺序变了,就不适用了,还是要找规则变化