Python 爬虫处理字体加密
汽车之家:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import re
import sys
import io
from fontTools.ttLib import TTFont
# Re-wrap stdout with gb18030 so the Chinese output prints correctly on GBK
# consoles (typically Windows cmd); gb18030 covers all Unicode code points.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
# 抓取autohome评论
# 抓取autohome评论 — scrape Autohome forum comments.
class AutoSpider:
    """Scrape one Autohome BBS thread and decode its font-obfuscated text.

    Autohome serves a dynamically generated .ttf font whose glyph order maps
    private-use code points onto a fixed Chinese word list. This spider
    downloads the page and its font, recovers the glyph order, and substitutes
    the real characters back into the post text.
    """

    # 页面初始化 — set up browser-like request headers.
    def __init__(self):
        # host and cookie are required or the site serves a block page.
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36",
            'host': 'club.autohome.com.cn',
            'cookie': '__ah_uuid=C526DAD3-76F6-42C8-956B-4CBE18611E7B; fvlid=1545293100124hzVSzLWmuB; sessionip=61.149.5.137; area=119999; sessionid=60F897CD-E743-449D-BEBE-44D6533DE992%7C%7C2018-12-20+16%3A05%3A06.291%7C%7Cwww.baidu.com; ahpau=1; sessionuid=60F897CD-E743-449D-BEBE-44D6533DE992%7C%7C2018-12-20+16%3A05%3A06.291%7C%7Cwww.baidu.com; pbcpopclub=0fb65f3c-0c57-43b3-ade4-2752c0517737; ref=www.baidu.com%7C0%7C0%7C0%7C2018-12-20+17%3A56%3A26.073%7C2018-12-20+16%3A05%3A06.291; autoac=DB6482147B98F5F9B9D0834939744526; autotc=3A5FEFF1E14636EA47902DA601BB1DF6; ahpvno=12'}

    # 获取评论 — fetch the thread, decode the font, and print the post.
    def getNote(self):
        """Fetch the thread page, download its font, and print decoded text."""
        url = "https://club.autohome.com.cn/bbs/thread-c-2778-69436529-1.html"
        # Fetch page content.
        r = requests.get(url, headers=self.headers)
        html = etree.HTML(r.text)
        # Locate the .ttf URL in the inline @font-face CSS.
        # FIX: raw string with escaped dot and non-greedy match — the original
        # ",url\('(//.*.ttf)'\)" relied on invalid string escapes and let "."
        # match any character before "ttf".
        font_re = re.compile(r",url\('(//.*?\.ttf)'\)")
        matches = font_re.findall(r.text)
        ttf = requests.get("http:" + matches[0], stream=True)
        with open("autohome.ttf", "wb") as fh:
            for chunk in ttf.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    fh.write(chunk)
        # Parse the downloaded font; its glyph order determines the mapping.
        font = TTFont('autohome.ttf')
        uniList = font['cmap'].tables[0].ttFont.getGlyphOrder()
        # Glyph names look like 'uniEDE3'; keep the hex part. Index 0 is the
        # .notdef glyph and is skipped.
        utf8List = [str(uni[3:]) for uni in uniList[1:]]
        # Fixed word list the site draws obfuscated characters from; the order
        # must correspond to the font's glyph order.
        wordList = ['一', '七', '三', '上', '下', '不', '中', '档', '比', '油', '泥', '灯',
                    '九', '了', '二', '五', '低', '保', '光', '八', '公', '六', '养', '内', '冷',
                    '副', '加', '动', '十', '电', '的', '皮', '盘', '真', '着', '路', '身', '软',
                    '过', '近', '远', '里', '量', '长', '门', '问', '只', '右', '启', '呢', '味',
                    '和', '响', '四', '地', '坏', '坐', '外', '多', '大', '好', '孩', '实', '小',
                    '少', '短', '矮', '硬', '空', '级', '耗', '雨', '音', '高', '左', '开', '当',
                    '很', '得', '性', '自', '手', '排', '控', '无', '是', '更', '有', '机', '来']
        print(utf8List)  # debug: show the recovered hex codes
        # Extract the post body text.
        text = html.xpath("string(//div[@class='tz-paragraph'])")
        # NOTE(review): this replaces the hex *names* (e.g. 'EDE3') inside the
        # text; if the live page embeds the actual private-use characters
        # instead, the substitution should use chr(int(code, 16)) — verify
        # against a live response.
        for idx in range(len(utf8List)):
            text = text.replace(utf8List[idx], wordList[idx])
        print(text)
# Entry point: build the scraper and dump the decoded thread text.
crawler = AutoSpider()
crawler.getNote()
猫眼(首页实时票房,基于字形轮廓比对解密):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import requests
from fontTools.ttLib import TTFont
from lxml import etree
def job():
    """Scrape Maoyan's homepage box-office figure and de-obfuscate its digits.

    Maoyan renders digits with a per-request webfont. Glyph *outlines* stay
    stable even though the code points change, so each online glyph is matched
    against a manually labelled reference font ('basefonts.woff') to recover
    the real digit; the HTML entities in the page source are then substituted
    and the page re-parsed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/66.0.3359.139 Safari/537.36 "
    }
    index_url = 'http://maoyan.com/'
    # Fetch the homepage.
    response_index = requests.get(index_url, headers=headers).text
    index_xml = etree.HTML(response_index)
    info_list = index_xml.xpath('//*[@id="app"]/div/div[1]/div[1]/div/div[2]/ul/li[1]/a/div[2]/div//text()')
    # Print the still-obfuscated value for comparison with the decoded one.
    a = u'电影名称:%s, 票房总数:%s' % (info_list[1], info_list[4])
    print(a)
    # Download the woff font referenced by the page's @font-face rule.
    woff_ = re.search(r"url\('(.*\.woff)'\)", response_index).group(1)
    woff_url = 'http:' + woff_
    response_woff = requests.get(woff_url, headers=headers).content
    with open('fonts.woff', 'wb') as f:
        f.write(response_woff)
    # base_nums / base_fonts were labelled by hand and must stay in sync with
    # basefonts.woff.
    baseFonts = TTFont('basefonts.woff')
    base_nums = ['7', '9', '0', '3', '6', '5', '2', '1', '4', '8']
    base_fonts = ['uniF04C', 'uniE374', 'uniF426', 'uniEAAA', 'uniF519', 'uniEEC4', 'uniF543', 'uniF7C7', 'uniF046',
                  'uniF08E']
    onlineFonts = TTFont('fonts.woff')
    # Strip the leading .notdef and trailing 'x' glyph names.
    uni_list = onlineFonts.getGlyphNames()[1:-1]
    temp = {}
    # Map each online glyph to a digit by outline equality.
    for i in range(10):
        onlineGlyph = onlineFonts['glyf'][uni_list[i]]
        for j in range(10):
            if onlineGlyph == baseFonts['glyf'][base_fonts[j]]:
                # e.g. 'uniF04C' -> '&#xf04c;' as it appears in the HTML.
                temp["&#x" + uni_list[i][3:].lower() + ';'] = base_nums[j]
                break  # FIX: each glyph matches exactly one digit; stop here
    # Substitute every obfuscated entity in a single regex pass.
    pat = '(' + '|'.join(temp.keys()) + ')'
    response_index = re.sub(pat, lambda x: temp[x.group()], response_index)
    # Re-parse and print the now-readable value.
    index_xml = etree.HTML(response_index)
    info_list = index_xml.xpath('//*[@id="app"]/div/div[1]/div[1]/div/div[2]/ul/li[1]/a/div[2]/div//text()')
    a = u'电影名称:%s, 票房总数:%s' % (info_list[1], info_list[4])
    print(a)
def ttf_to_xml(font_path='base.woff', xml_path=None):
    """Dump a font file to fontTools TTX XML for manual glyph inspection.

    Args:
        font_path: Path of the font to read (defaults to 'base.woff',
            matching the original hard-coded behavior).
        xml_path: Output path; defaults to font_path with its extension
            replaced by '.xml'.
    """
    if xml_path is None:
        xml_path = font_path.rsplit('.', 1)[0] + '.xml'
    TTFont(font_path).saveXML(xml_path)
去哪儿网:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from fontTools.ttLib import TTFont
import requests
from datetime import datetime
import json
# Mobile (iPhone Safari) User-Agent — Qunar's touch/* API endpoints expect a
# phone client.
phone_headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'}
# Desktop Chrome User-Agent, used when downloading the font file itself.
web_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
}
def job():
    """Download Qunar's flight-price obfuscation font and dump it to TTX XML.

    The glyph-name -> character table below corresponds to that font family
    and is kept as reference data for decoding prices by hand.
    """
    # 这个是与上面的字体文件相对应的 — glyph names in the downloaded font and
    # the characters they render (mapping established manually).
    number_dict = {
        "period": ".",
        "zero": "0",
        "one": "1",
        "two": "2",
        "three": "3",
        "four": "4",
        "five": "5",
        "six": "6",
        "seven": "7",
        "eight": "8",
        "nine": "9"
    }
    font_url = get_font_ttf()
    # Local file name = last two URL path components joined together.
    parts = font_url.split('/')[-2:]
    name = parts[0] + parts[1]
    font_content = requests.get(font_url, headers=web_headers).content
    with open(name, 'wb') as f:
        f.write(font_content)
    font = TTFont(name)
    # FIX: swap only the *extension*; the original name.replace("ttf", "xml")
    # would also rewrite 'ttf' occurring anywhere else in the file name.
    xml_name = name[:-3] + 'xml' if name.endswith('.ttf') else name + '.xml'
    font.saveXML(xml_name)
def get_font_ttf():
    """POST a flight search to Qunar's touch API and return the font URL.

    The JSON response carries the obfuscation font's location under
    data.obfuscate.fontSrc.

    Returns:
        str: absolute https URL of the font file.
    """
    # NOTE(review): 'r' and '__m__' look like anti-bot tokens captured from a
    # live browser session; they may expire and need refreshing.
    # FIX: the original also built a second, completely unused payload dict
    # ('post_data' with from=touch_index_guess); that dead code was removed.
    payload = {
        'arrCity': "上海",
        'baby': "0",
        'cabinType': "0",
        'child': "0",
        'depCity': "北京",
        'firstRequest': 'true',
        'from': "touch_index_search",
        'goDate': datetime.now().strftime('%Y-%m-%d'),
        'r': 1544750638857,
        'sort': 5,
        'startNum': 0,
        'underageOption': "",
        '__m__': "fa4863f52526dbbe3b3cba0e3de7e006",
        '_v': 2
    }
    resp = requests.post('https://m.flight.qunar.com/touch/api/domestic/flightlist', data=payload)
    body = json.loads(resp.text)
    font_src = "https:" + body['data']['obfuscate']['fontSrc']
    print(font_src)
    return font_src
# Script entry point: only run the font-download job when executed directly.
if __name__ == '__main__':
    job()
猫眼(影院排场页面,woff 转 otf 后比对解密——与上例的首页票房不同):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from lxml import html
import re
import woff2otf
from fontTools.ttLib import TTFont
from bs4 import BeautifulSoup as bs
# 抓取maoyan票房
# 抓取maoyan票房 — scrape Maoyan cinema showtimes with obfuscated prices.
class MaoyanSpider:
    """Scrape a Maoyan cinema page and decode its font-obfuscated prices.

    Price digits are drawn with a per-request 'stonefont' woff font. The font
    is converted to otf, and each glyph outline is matched against a manually
    labelled reference font ('base.otf') to recover the real digit.
    """

    # 页面初始化 — generic browser-like headers shared by the requests.
    def __init__(self):
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"
        }

    # 获取票房 — fetch the cinema page, decode prices, print the showtimes.
    def getNote(self):
        """Fetch one cinema page, decode its prices, and print each showtime."""
        url = "http://maoyan.com/cinema/15887?poi=91871213"
        host = {'host': 'maoyan.com',
                'refer': 'http://maoyan.com/news', }
        # FIX: dict(items + items) is Python 2 only (dict_items do not support
        # '+' on Python 3); merge the two dicts with unpacking instead.
        headers = {**self.headers, **host}
        # Fetch the page content.
        r = requests.get(url, headers=headers)
        u = r.text
        # Locate the woff URL in the inline @font-face CSS. FIX: raw-string
        # pattern with escaped dot — the original used invalid string escapes
        # and let '.' match any character.
        woff_re = re.compile(r",\nurl\('(//.*?\.woff)'\) format\('woff'\)")
        found = woff_re.findall(r.text)
        ttf = requests.get("http:" + found[0], stream=True)
        with open("maoyan.woff", "wb") as fh:
            for chunk in ttf.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    fh.write(chunk)
        # 转换woff字体为otf字体 — convert so fontTools can diff against base.otf.
        woff2otf.convert('maoyan.woff', 'maoyan.otf')
        # Parse both fonts; glyph outlines identify the digits.
        baseFont = TTFont('base.otf')
        maoyanFont = TTFont('maoyan.otf')
        uniList = maoyanFont['cmap'].tables[0].ttFont.getGlyphOrder()
        numList = []
        # Hand-labelled mapping for base.otf; the two lists are index-aligned.
        baseNumList = ['.', '3', '5', '1', '2', '7', '0', '6', '9', '8', '4']
        baseUniCode = ['x', 'uniE64B', 'uniE183', 'uniED06', 'uniE1AC', 'uniEA2D', 'uniEBF8',
                       'uniE831', 'uniF654', 'uniF25B', 'uniE3EB']
        # Match online glyphs 1..11 (index 0 is the .notdef glyph) to digits.
        for gi in range(1, 12):
            maoyanGlyph = maoyanFont['glyf'][uniList[gi]]
            for bj in range(11):
                if maoyanGlyph == baseFont['glyf'][baseUniCode[bj]]:
                    numList.append(baseNumList[bj])
                    break
        uniList[1] = 'uni0078'  # normalize the 'x' glyph name to its code point
        # FIX: build real characters with chr(int(hex, 16)) instead of eval(),
        # and keep them as str (no .encode) so str.replace works on Python 3.
        charList = [chr(int(uni[3:], 16)) for uni in uniList[1:]]
        # Extract showtime rows from the page.
        soup = bs(u, "html.parser")
        index = soup.find_all('div', {'class': 'show-list'})
        print('---------------Prices-----------------')
        for _ in range(len(index)):
            mn = soup.find_all('h3', {'class': 'movie-name'})
            ting = soup.find_all('span', {'class': 'hall'})
            mt = soup.find_all('span', {'class': 'begin-time'})
            mw = soup.find_all('span', {'class': 'stonefont'})
            for row in range(len(mt)):
                moviename = mn[row].get_text()
                film_ting = ting[row].get_text()
                movietime = mt[row].get_text()
                moviewish = mw[row].get_text()
                # Substitute each obfuscated character with its real digit.
                for k in range(len(charList)):
                    moviewish = moviewish.replace(charList[k], numList[k])
                print(moviename, film_ting, movietime, moviewish)
# Kick off the Maoyan showtime scraper when the script runs.
maoyan_spider = MaoyanSpider()
maoyan_spider.getNote()
完整代码下载:https://github.com/tanjunchen/SpiderProject/tree/master/fontfaceDecrypt