from xml import etree
import requests
import re
import base64
from fontTools.ttLib import TTFont, BytesIO
# Request headers for su.58.com. A realistic browser fingerprint plus a
# logged-in session cookie — the search page now requires authentication.
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0",
# The cookie is required because the page now demands a logged-in state.
# NOTE(review): this is a captured session cookie — it expires; refresh it
# from a browser session before running.
"cookie": 'f=n; commontopbar_new_city_info=5%7C%E8%8B%8F%E5%B7%9E%7Csu; id58=UvTEUl/zCqajUJaybRC9bw==; 58tj_uuid=7009bbe1-76f8-4b73-8376-1f7a69d5f496; als=0; wmda_new_uuid=1; wmda_uuid=c67559bfe142299dbd374d9dab028df8; xxzl_deviceid=KaV0c6cV6yrVONO9yTVEfuxpPimKScDGVOLYxOl3UXSNS5GD%2BMfPxb30M3qNp5Fi; xxzl_smartid=e80e9b7ddd64b4b4d1a2cd8041cebe27; gr_user_id=b371d8ff-2cf7-43ca-9097-e7dd72fc6224; xxzl_cid=e61ae401e08e40388d40a8e34b29956a; xzuid=7f33cba3-b8fd-43f1-839e-fa2c1cc9cafa; Hm_lvt_3bb04d7a4ca3846dcc66a99c3e861511=1610121167,1610166089,1610166404,1610369641; Hm_lvt_e15962162366a86a6229038443847be7=1610121167,1610166090,1610166405,1610369641; Hm_lvt_e2d6b2d0ec536275bb1e37b421085803=1610120643,1610166104,1610369657; ppStore_fingerprint=43A259D40092AFB43E772FCC9350C463700EAEBE5D26276C%EF%BC%BF1610369657573; 58home=su; f=n; commontopbar_new_city_info=5%7C%E8%8B%8F%E5%B7%9E%7Csu; city=su; new_uv=6; utm_source=12345; spm=164128813067.zhaopin_baidu; wmda_session_id_1731916484865=1615820398660-8edb2386-ae73-b2ed; wmda_visited_projects=%3B11187958619315%3B10104579731767%3B7790950805815%3B3381039819650%3B1409632296065%3B1731916484865; commontopbar_ipcity=su%7C%E8%8B%8F%E5%B7%9E%7C0; sessionid=99a3f5e1-24e3-4673-9285-97c9ffdd3501; param8716kop=1; wmda_session_id_10104579731767=1615820515565-2d473dec-cfe9-e637; new_session=0; init_refer=; crmvip=""; dk_cookie=""; isSmartSortTipShowed=true; PPU="UID=57606948143127&UN=%E9%83%AD%E6%88%90%E4%BC%9F18567839070&TT=393add6da5f4879dfc0ba40e1c64c563&PBODY=cdiJiPxHFQ_N7xQnaMGLnXGVihtEYl982at6jeFRhTVsWMOJY2Lsy9f_XmThf5RZgWrPwQLMw33MnptMvXSDwDNlQsMGZAcHd4iBcopZtNFHDI_n_0cswApG1qXvW9Ksb81wcjYnrbyslFqGz6Yi4-O0jUZQCJXbnY1tCMafLjE&VER=1"; www58com="UserID=57606948143127&UserName=%E9%83%AD%E6%88%90%E4%BC%9F18567839070"; 58cooper="userid=57606948143127&username=%E9%83%AD%E6%88%90%E4%BC%9F18567839070"; 58uname=%E9%83%AD%E6%88%90%E4%BC%9F18567839070; Hm_lvt_a3013634de7e7a5d307653e15a0584cf=1615820690,1615820708,1615820829; param8616=0; JSESSIONID=A819D6FD17545AB36C96E42D3923EA40; jl_list_left_banner=11; Hm_lpvt_a3013634de7e7a5d307653e15a0584cf=1615821167',
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
}
# The x-delta and y-delta between a glyph's first two outline coordinates do
# not change when the site rotates the font file, so the (dx, dy) pair is a
# stable fingerprint used as the key mapping to the real character.
data_map = {
(0, 1549): "B", # deliberately incomplete — each glyph must be matched to its character by hand
}
# Fetch the job-search page, pull the base64-encoded WOFF font embedded in its
# @font-face rule, and load it for glyph inspection.
url = "https://su.58.com/searchjob/"
response = requests.get(url=url, headers=headers).text

# The font payload sits between "base64," and the closing ")" of the CSS url().
# Guard explicitly: an unmatched page would otherwise die with an opaque
# AttributeError on .group().
match = re.search(r"base64,(.*?)\)", response, flags=re.S)
if match is None:
    raise ValueError("no base64 font payload found in the page source — "
                     "layout may have changed or the session cookie expired")
woff = base64.b64decode(match.group(1))

# Parse the font straight from memory. Glyph names look like 'uniE025';
# the first and last entries are 'glyph00000' and 'x'.
# NOTE(review): BytesIO comes from the `fontTools.ttLib` import above — a
# legacy re-export; modern fontTools drops it, prefer `io.BytesIO`.
tf = TTFont(BytesIO(woff))

# Keep on-disk copies for manual inspection / hand-building data_map.
with open("ztku01.woff", "wb") as f:
    f.write(woff)
tf.saveXML("ztku01.xml")  # dump the already-parsed font as XML; no need to re-read the .woff
# Build font_map: HTML entity form of each glyph name (e.g. "&#xe025") ->
# the real character it renders, keyed through the (dx, dy) fingerprint of
# the glyph's first two outline coordinates (stable across font refreshes).
font_map = {}
for glyph_name in tf.getGlyphNames()[1:-1]:  # slice skips 'glyph00000' and 'x'
    # Hoist the glyph lookup; coordinates is a GlyphCoordinates sequence of
    # (x, y) pairs, e.g. [(1990, 244), (1960, 205), ...].
    coords = tf["glyf"][glyph_name].coordinates
    x1, y1 = coords[0]
    x2, y2 = coords[1]
    fingerprint = (x2 - x1, y2 - y1)
    real_char = data_map.get(fingerprint)
    if real_char is None:
        # data_map is filled in by hand and intentionally partial — skip
        # glyphs that have not been decoded yet instead of raising KeyError.
        continue
    # Page source encodes the glyph as an HTML entity, e.g. "uniE025" -> "&#xe025".
    font_map[glyph_name.replace("uni", "&#x").lower()] = real_char
# Substitute every obfuscated entity in the page with its decoded character.
# BUG FIX: the original did `html = response.replace(...)` inside the loop,
# restarting from the raw page on every pass so only the LAST substitution
# survived; accumulate into `html` instead.
html = response
for entity, real_char in font_map.items():
    html = html.replace(entity + ";", real_char)
print(html)
# NOTE(review): `from xml import etree` (L1) provides no HTML(); this call
# needs `from lxml import etree` — confirm the intended dependency.
data = etree.HTML(html)
# Source article: 《封号码罗》python爬虫之某同城字体反爬 (part 12)
# Latest recommended revision published 2023-04-27 11:18:24