目标网站:闪职数据
方法步骤:
1、找到字体文件;2、找到替换关系;3、替换爬取下来的数据,处理得到正常的数据。
如何找到字体文件:
- 定位到进行了字体反爬的位置,在对应的 styles 里面找到 font-family
- 复制 font-family 里面的值,去网页源码里面搜索
- 在搜索结果的附近 找到 xxx.ttf 这样的url 进行下载
- 如果需要通过 python 去读取识别字体文件里面的内容: pip install fontTools -i https://pypi.tuna.tsinghua.edu.cn/simple
找到对应关系:
对应关系:
源代码:
闪职.py
import requests
from lxml import etree
from tool import encrypto
from fontTools.ttLib import TTFont
# Request headers shared by every HTTP call in this script: a desktop Chrome
# User-Agent plus a fixed Cookie string (session id and CSRF token).
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/99.0.4844.82 Safari/537.36'
    ),
    'Cookie': (
        'shanzhi_kmer=h9wm0ptr9kcuza527as3a43a6zuwzsid; '
        'csrftoken=yHdq0AaPEO1pyiC2zji4MmeyLRNcXVcZLNoHCT3izQ52lwvASvHv0jgsGG9kEXUN'
    ),
}
def get_data(timeout=10):
    """Fetch the login page and return its HTML.

    Sends a GET request to the login URL, using the module-level ``header``
    dict as the request headers.

    :param timeout: seconds to wait for the server before giving up; without
        a timeout ``requests`` may block indefinitely on an unresponsive host
    :return: the response body as text
    """
    # Target URL
    url = 'http://shanzhi.spbeen.com/login/'
    res = requests.get(url, headers=header, timeout=timeout)
    return res.text
def deal_data(text):
    """Extract the CSRF token and the ``pk`` value from the login page HTML.

    :param text: HTML source of the login page
    :return: tuple ``(csrfmiddlewaretoken, pk)`` taken from the ``value``
        attributes of the matching ``<input>`` elements
    :raises IndexError: if either ``<input>`` element is not found
    """
    token_xpath = '//input[@name="csrfmiddlewaretoken"]/@value'
    pk_xpath = '//input[@id = "pk"]/@value'

    document = etree.HTML(text)
    # Take the first match of each query, in the same order as before.
    token = document.xpath(token_xpath)[0]
    public_key = document.xpath(pk_xpath)[0]
    return token, public_key
def rebuilt(pk, csrfmiddlewaretoken, username='logic_00', password='logic_00',
            timeout=10):
    """Submit the login form with an encrypted password and return the HTML.

    The plaintext password is encrypted with ``encrypto`` using ``pk`` and
    posted, together with the username and CSRF token, to the login URL.

    :param pk: value passed to ``encrypto`` as the public key
    :param csrfmiddlewaretoken: CSRF token sent back in the form
    :param username: account name; defaults to the previously hard-coded one
    :param password: plaintext password; defaults to the previously
        hard-coded one
    :param timeout: seconds to wait for the server before giving up, so the
        request cannot hang indefinitely
    :return: the response body as text
    """
    # Form payload; the password is sent only in its encrypted form.
    data_dict = {
        "username": username,
        "password": encrypto(pk, password),
        "csrfmiddlewaretoken": csrfmiddlewaretoken,
    }
    url = 'http://shanzhi.spbeen.com/login/'
    res = requests.post(url, data=data_dict, headers=header, timeout=timeout)
    return res.text
def deal_ttf(html, font_path='szec.ttf'):
    """Replace font-obfuscated character entities in *html* and print it.

    Reads the font's best cmap (code point -> glyph name), turns every code
    point into its ``&#x...;`` entity and maps it to a number derived from
    the glyph name, then substitutes those entities in *html*.

    :param html: HTML text containing ``&#x...;`` entities
    :param font_path: path of the TTF file; defaults to the previously
        hard-coded ``'szec.ttf'``
    :return: the HTML with the entities replaced (it is also printed, as
        before)
    :raises ValueError: if a glyph name does not end in two digits
    """
    font = TTFont(font_path)
    try:
        cmap = font.getBestCmap()
    finally:
        # Release the underlying file handle once the mapping has been read.
        font.close()

    # entity string -> replacement text
    font_dict = {}
    for code_point, glyph_name in cmap.items():
        # Code point in hex, written as an HTML character reference.
        entity = '&#x' + hex(code_point)[2:] + ';'
        # The last two characters of the glyph name, read as an integer,
        # are one higher than the number to substitute.
        font_dict[entity] = str(int(glyph_name[-2:]) - 1)

    for entity, replacement in font_dict.items():
        html = html.replace(entity, replacement)

    print(html)
    return html
if __name__ == '__main__':
    # 1. Download the login page.
    login_page = get_data()
    # 2. Pull the CSRF token and the pk value out of it.
    token, public_key = deal_data(login_page)
    # 3. Log in; the returned page still contains the obfuscated entities.
    obfuscated_html = rebuilt(public_key, token)
    # 4. Decode the entities using the font file.
    deal_ttf(obfuscated_html)
tool.py
# pip install pycryptodome -i https://pypi.tuna.tsinghua.edu.cn/simple
from Crypto.PublicKey import RSA
from Crypto.Cipher import PKCS1_v1_5 as cry_pksc1_v1_5
import base64
def encrypto(pk, password):
    """Encrypt a password with an RSA public key.

    :param pk: public key body, placed between the PEM BEGIN/END lines
    :param password: plaintext password
    :return: Base64 text of the PKCS#1 v1.5 encrypted password
    """
    # Wrap the key body in PEM armour so it can be imported.
    pem = "-----BEGIN PUBLIC KEY-----\n{}\n-----END PUBLIC KEY-----".format(pk)
    # Build a PKCS#1 v1.5 cipher from the imported RSA key.
    encryptor = cry_pksc1_v1_5.new(RSA.importKey(pem))
    # The cipher works on bytes, so encode the password first.
    encrypted = encryptor.encrypt(password.encode())
    # Base64-encode the ciphertext and return it as str.
    return base64.b64encode(encrypted).decode()
总结:注意找到字体的对应关系然后进行爬取