58同城 房产字体 反爬
-
获取HTML
# Download the listing page at `url` and cache the raw HTML in
# 58-fangchan.txt, which the font-extraction step reads back.
url = "https://sz.58.com/zufang/"
# Request headers; despite the name, UA carries the referer as well as the
# user-agent string.
UA = {
    "referer": "https://www.google.com/",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.183 Safari/537.36"
    ),
}
# requests applies no timeout by default, so bound the wait explicitly.
html = requests.get(url=url, headers=UA, timeout=10)
# Fail loudly on 4xx/5xx instead of caching an error page as the listing.
html.raise_for_status()
# Force UTF-8 decoding of the body before reading .text.
html.encoding = "utf-8"
ret = html.text
with open('58-fangchan.txt', "w", encoding="utf-8") as f:
    f.write(ret)
-
获取字体文件
# 1. Read the cached page and pull out the embedded font payload.
with open("58-fangchan.txt", "r", encoding='utf-8') as f:
    content = f.read()

# The pattern captures everything between "base64," and the first "')".
font_match = re.search(r"base64,(.*?)'\)", content)
if font_match is None:
    # re.search returns None when nothing matches; report that directly
    # instead of failing with an AttributeError on `.group`.
    raise ValueError("no base64 font payload found in 58-fangchan.txt")
font_face = font_match.group(1)
-
转换 xml, 分析关系
from fontTools.ttLib import TTFont

# Load the saved TTF and dump it as XML so the glyph tables can be read by hand.
ttf_path = "./b-58fangchan字体.ttf"
xml_path = "b-58fangchan字体.xml"

font = TTFont(ttf_path)
font.saveXML(xml_path)
-
字体的映射关系
经过反复分析可以得出:取出 cmap 中字形名称里的数字,再减 1,即为该字符对应的真实数字。
-
全部代码
import base64
import io
import re

from fontTools.ttLib import TTFont
from lxml import etree
import requests

# Optional debugging aid, left disabled (it needs `font_face`, which is only
# assigned in step 1 below): write the decoded font to disk and dump it as XML.
# ret = base64.b64decode(font_face)
# with open('58fangchan.ttf', 'wb') as f:
#     f.write(ret)
# font = TTFont(ret)
# font.saveXML('b-58.xml')

url = "https://sz.58.com/zufang/"
# Request headers; despite the name, UA carries the referer as well as the
# user-agent string.
UA = {
    "referer": "https://www.google.com/",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.183 Safari/537.36"
    ),
}

# Fetch step, left disabled: the script below works from the cached copy in
# 58-fangchan.txt. Uncomment to download the page and rewrite that file.
# html = requests.get(url=url, headers=UA)
# html.encoding="utf-8"
#
# ret = html.text
# with open('58-fangchan.txt', "w", encoding="utf-8") as f:
#     f.write(ret)

# 1. Read the cached page and pull out the embedded font payload.
with open("58-fangchan.txt", "r", encoding='utf-8') as f:
    content = f.read()

# The pattern captures everything between "base64," and the first "')".
font_match = re.search(r"base64,(.*?)'\)", content)
if font_match is None:
    # re.search returns None when nothing matches; report that directly
    # instead of failing with an AttributeError on `.group`.
    raise ValueError("no base64 font payload found in 58-fangchan.txt")
font_face = font_match.group(1)
# print(font_face)

# 2. Base64-decode the payload and load the font straight from memory.
ret = base64.b64decode(font_face)
font = TTFont(io.BytesIO(ret))
bestcmap = font['cmap'].getBestCmap()

# Compiled once; used on every glyph name in the loop below.
glyph_digits = re.compile(r'(\d+)')

for k, v in bestcmap.items():
    # print(k)
    # print(v)
    digits = glyph_digits.search(v)
    if digits is None:
        # A glyph name without digits cannot be mapped by the rule below;
        # skip it rather than crash on `.group`.
        continue
    # k is the code point as a decimal int; rewrite it as the hexadecimal
    # character reference form "&#x....;" that is searched for in the page.
    k = hex(k).replace('0x', '&#x') + ";"
    # Rule found by inspecting the font: the number in the glyph name minus 1
    # is the digit that the glyph displays.
    v = int(digits.group(0)) - 1
    print(k)
    print(v)
    # str.replace is a no-op when k is absent, so no membership check is needed.
    content = content.replace(k, str(v))

# print(content)

# 3. Parse the substituted HTML and print the title of each listing.
resp = etree.HTML(content)
lis = resp.xpath("//ul[@class='listUl']/li")
for li in lis:
    title = li.xpath('./div[@class="des"]/h2/a/text()')
    if title:
        # xpath returns a list of text nodes; the first one is the title.
        title = title[0]
        print(title)