搞了一下午的猫眼字体反爬，记录一下踩过的坑。
1、寻找字体文件,进行下载观察
我们打开 Chrome 浏览器的开发者模式，发现 Elements 里面的数字都显示为方框。首先想到猫眼对字体进行了加密，所以继续查看网页源代码，发现用户评分部分是以 &#x 开头的数字和字母的组合，如下图：
现在在 Font 中寻找加密字体文件，并在该字体文件请求的 Headers 中找到网址链接进行下载，如图：
2. 下载以后进行对比
下载以后，打开后缀为 .woff 的文件，与网页源代码中的字体编码进行对比，发现如下：
多次请求后发现字体加密为动态加密，找到规律以后开始写代码：
# Shared helper used for every HTTP request in this script.
def get_html(url, timeout=10):
    """Fetch *url* with the module-level ``headers`` and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` object. The status code is not checked, so
        callers receive error and verification pages as well.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    return requests.get(url, headers=headers, timeout=timeout)
# Step one: download the font file referenced by the film page.
def get_font():
    """Download the film page's woff font and return it with the page HTML.

    Requests the hard-coded film page, extracts the woff URL from the CSS
    ``url('//vfile.meituan.net/colorstone/....woff') format('woff');``
    declaration and writes the font into the current directory under its
    remote file name.

    Returns:
        A ``(file_name, html_text)`` tuple: the local font file name and the
        text of the film page.

    Raises:
        IndexError: If the page contains no matching font URL. The message
            includes the URL of the response that was actually received, so
            it can be opened in a browser to inspect what came back.
    """
    maoyan_html = get_html('https://maoyan.com/films/1190122')
    html_text = maoyan_html.text
    # Raw string so the regex escapes are not treated as (invalid) string
    # escapes; literal dots are escaped so they only match a real ".".
    pattern = re.compile(
        r"url\('(//vfile\.meituan\.net/colorstone/.*?\.woff)'\) format\('woff'\);"
    )
    font_paths = pattern.findall(html_text)
    print(font_paths)
    if not font_paths:
        raise IndexError(
            'no woff font URL found in the page returned from %s'
            % maoyan_html.url
        )
    # The page uses a protocol-relative URL ("//host/..."), so add a scheme.
    font_url = 'http:' + font_paths[0]
    file_name = font_url.split('/')[-1]
    font_response = get_html(font_url)
    with open(file_name, mode='wb') as f:
        f.write(font_response.content)
    return file_name, html_text
将字体文件下载以后，先转换成 XML 格式文件进行观察；再根据字形把字符绘制到一张图片上，利用 pytesseract 进行图片识别，代码如下：
def get_map_font(file_name):
    """Build a mapping from HTML entities to the characters they render as.

    The font's glyphs are drawn onto a white image with the font itself, the
    image is read back with OCR, and the recognised characters are paired in
    order with the glyph codes.

    Side effects: writes ``font.xml`` (XML dump of the font) and ``sss.jpg``
    (the rendered glyphs) into the current directory and prints intermediate
    values.

    Args:
        file_name: Path of the downloaded font file.

    Returns:
        A dict whose keys are entities of the form ``"&#xXXXX;"`` (lower
        case) and whose values are the single characters recognised by OCR.
        Pairing is positional, so a wrong OCR read yields a wrong mapping.
    """
    tt_font = TTFont(file_name)
    # Dump the font as XML for manual inspection.
    tt_font.saveXML('font.xml')
    # The first two glyph names are skipped; presumably they are placeholder
    # glyphs rather than digits -- TODO confirm against font.xml.
    code_list = tt_font.getGlyphOrder()[2:]

    # Blank white canvas to draw the glyphs on.
    im = Image.new("RGB", (1800, 1800), (255, 255, 255))
    image_draw = ImageDraw.Draw(im)
    # Separate name: this is the PIL rendering font, not the fontTools object.
    render_font = ImageFont.truetype(file_name, 40)
    print(code_list)

    # Turn each "uniXXXX" glyph name into a "\uXXXX" escape, then decode the
    # escapes to obtain the actual characters to draw.
    new_list = [code.replace('uni', '\\u') for code in code_list]
    print('替换之后', new_list)
    text = ''.join(new_list)
    text = text.encode('utf-8').decode('unicode_escape')

    image_draw.text((0, 100), text, font=render_font, fill="#000000")
    im.save("sss.jpg")
    # Context manager so the reopened file handle is closed after OCR.
    with Image.open("sss.jpg") as rendered:
        res = pytesseract.image_to_string(rendered, lang="chi_sim")

    # Drop whitespace (spaces, newlines, form feeds) from the OCR output;
    # otherwise those characters take part in the positional pairing below
    # and shift every following glyph onto the wrong character.
    res_str = [ch for ch in res if not ch.isspace()]
    print(res_str)

    # The page references glyphs as "&#xXXXX;" entities in lower case.
    html_code_list = [i.lower().replace("uni", "&#x") + ";" for i in code_list]
    print(html_code_list)
    result = dict(zip(html_code_list, res_str))
    print(result)
    return result
识别以后就可以得到字典形式的对应关系，然后对 HTML 文件进行替换和提取。为了判断是否解密成功，我只提取了用户评分，代码如下：
def replace(html, pattern):
    """Substitute every key of *pattern* found in *html* with its value.

    Each key/value pair is printed as it is applied. The substituted text is
    also written to ``替换后.html`` (UTF-8) in the current directory.

    Args:
        html: Page source containing the entities to substitute.
        pattern: Dict mapping entity strings to replacement characters.

    Returns:
        The HTML text after all substitutions.
    """
    decoded = html
    for entity in pattern:
        glyph = pattern[entity]
        decoded = decoded.replace(entity, glyph)
        print(entity, glyph)

    with open('替换后.html', mode='w', encoding='utf-8') as out_file:
        out_file.write(decoded)
    return decoded
def parser(html):
    """Return the text of every ``.stonefont`` node inside ``.index-left``.

    Args:
        html: Page source, normally after entity substitution.

    Returns:
        A list of the matched text strings, as returned by parsel's
        ``getall()``.
    """
    return parsel.Selector(html).css(".index-left .stonefont::text").getall()
完整代码如下:
# -*- coding: UTF-8 -*-
import parsel
import pytesseract
import requests
import re
from fontTools.ttLib import TTFont
from PIL import Image, ImageDraw, ImageFont
# Request headers sent with every call; the User-Agent is a desktop Chrome string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.100 Safari/537.36'
    ),
}
def get_html(url, timeout=10):
    """Fetch *url* with the module-level ``headers`` and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` object. The status code is not checked, so
        callers receive error and verification pages as well.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    return requests.get(url, headers=headers, timeout=timeout)
def get_font():
    """Download the film page's woff font and return it with the page HTML.

    Requests the hard-coded film page, extracts the woff URL from the CSS
    ``url('//vfile.meituan.net/colorstone/....woff') format('woff');``
    declaration and writes the font into the current directory under its
    remote file name.

    Returns:
        A ``(file_name, html_text)`` tuple: the local font file name and the
        text of the film page.

    Raises:
        IndexError: If the page contains no matching font URL. The message
            includes the URL of the response that was actually received, so
            it can be opened in a browser to inspect what came back.
    """
    maoyan_html = get_html('https://maoyan.com/films/1190122')
    html_text = maoyan_html.text
    # Raw string so the regex escapes are not treated as (invalid) string
    # escapes; literal dots are escaped so they only match a real ".".
    pattern = re.compile(
        r"url\('(//vfile\.meituan\.net/colorstone/.*?\.woff)'\) format\('woff'\);"
    )
    font_paths = pattern.findall(html_text)
    print(font_paths)
    if not font_paths:
        raise IndexError(
            'no woff font URL found in the page returned from %s'
            % maoyan_html.url
        )
    # The page uses a protocol-relative URL ("//host/..."), so add a scheme.
    font_url = 'http:' + font_paths[0]
    file_name = font_url.split('/')[-1]
    font_response = get_html(font_url)
    with open(file_name, mode='wb') as f:
        f.write(font_response.content)
    return file_name, html_text
def get_map_font(file_name):
    """Build a mapping from HTML entities to the characters they render as.

    The font's glyphs are drawn onto a white image with the font itself, the
    image is read back with OCR, and the recognised characters are paired in
    order with the glyph codes.

    Side effects: writes ``font.xml`` (XML dump of the font) and ``sss.jpg``
    (the rendered glyphs) into the current directory and prints intermediate
    values.

    Args:
        file_name: Path of the downloaded font file.

    Returns:
        A dict whose keys are entities of the form ``"&#xXXXX;"`` (lower
        case) and whose values are the single characters recognised by OCR.
        Pairing is positional, so a wrong OCR read yields a wrong mapping.
    """
    tt_font = TTFont(file_name)
    # Dump the font as XML for manual inspection.
    tt_font.saveXML('font.xml')
    # The first two glyph names are skipped; presumably they are placeholder
    # glyphs rather than digits -- TODO confirm against font.xml.
    code_list = tt_font.getGlyphOrder()[2:]

    # Blank white canvas to draw the glyphs on.
    im = Image.new("RGB", (1800, 1800), (255, 255, 255))
    image_draw = ImageDraw.Draw(im)
    # Separate name: this is the PIL rendering font, not the fontTools object.
    render_font = ImageFont.truetype(file_name, 40)
    print(code_list)

    # Turn each "uniXXXX" glyph name into a "\uXXXX" escape, then decode the
    # escapes to obtain the actual characters to draw.
    new_list = [code.replace('uni', '\\u') for code in code_list]
    print('替换之后', new_list)
    text = ''.join(new_list)
    text = text.encode('utf-8').decode('unicode_escape')

    image_draw.text((0, 100), text, font=render_font, fill="#000000")
    im.save("sss.jpg")
    # Context manager so the reopened file handle is closed after OCR.
    with Image.open("sss.jpg") as rendered:
        res = pytesseract.image_to_string(rendered, lang="chi_sim")

    # Drop whitespace (spaces, newlines, form feeds) from the OCR output;
    # otherwise those characters take part in the positional pairing below
    # and shift every following glyph onto the wrong character.
    res_str = [ch for ch in res if not ch.isspace()]
    print(res_str)

    # The page references glyphs as "&#xXXXX;" entities in lower case.
    html_code_list = [i.lower().replace("uni", "&#x") + ";" for i in code_list]
    print(html_code_list)
    result = dict(zip(html_code_list, res_str))
    print(result)
    return result
def replace(html, pattern):
    """Substitute every key of *pattern* found in *html* with its value.

    Each key/value pair is printed as it is applied. The substituted text is
    also written to ``替换后.html`` (UTF-8) in the current directory.

    Args:
        html: Page source containing the entities to substitute.
        pattern: Dict mapping entity strings to replacement characters.

    Returns:
        The HTML text after all substitutions.
    """
    decoded = html
    for entity in pattern:
        glyph = pattern[entity]
        decoded = decoded.replace(entity, glyph)
        print(entity, glyph)

    with open('替换后.html', mode='w', encoding='utf-8') as out_file:
        out_file.write(decoded)
    return decoded
def parser(html):
    """Return the text of every ``.stonefont`` node inside ``.index-left``.

    Args:
        html: Page source, normally after entity substitution.

    Returns:
        A list of the matched text strings, as returned by parsel's
        ``getall()``.
    """
    return parsel.Selector(html).css(".index-left .stonefont::text").getall()
if __name__ == '__main__':
    # get_font() returns (font file name, page HTML).
    font_file, page_html = get_font()
    # Map the page's entities to the characters recognised from the font.
    entity_map = get_map_font(font_file)
    # Substitute the entities, then print the extracted text nodes.
    print(parser(replace(page_html, entity_map)))
替换结果如图所示:
注：如有报错，请打印获取到的 URL。有可能是请求次数过多，猫眼要求进行验证；此时点击打印出的 URL 转到网页，通过滑动验证后继续请求即可，如图：