最近在公司看到有小伙子在做这个类似于闯关的爬虫训练,闲来无事也来做一下。
因为这一关还算比较简单,就不详细讲解了。代码逻辑很清晰;因为写得比较简单,所以运行速度会慢一些。不过答案是正确的,哈哈。
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""
@author: Cjp
@file: 3.py
@time: 2020/9/4 10:23
"""
import time
import requests
import re
import base64
import pandas as pd
from lxml import etree
from fontTools.ttLib import TTFont
class GlidedSky(object):
    """Scraper for the GlidedSky ``crawler-font-puzzle-1`` level.

    Every page is expected to embed a base64-encoded font. The digits written
    in the page source are translated through that font's glyph order (see
    ``get_xml_ttf``) and the translated numbers on the page are summed.
    """

    # Where the embedded font is written by get_results() and read back by
    # get_xml_ttf().
    FONT_PATH = 'Glisky.ttf'
    # XML dump of the font produced on every page; kept as a debugging aid.
    XML_PATH = 'GGlisky.xml'

    # Raw string: '\(' in a plain string literal is an invalid escape sequence
    # and triggers a warning on modern Python versions.
    _FONT_RE = re.compile(r'src: url\(data:font;charset=utf-8;base64,(.*?)\)')

    # English glyph name -> digit character.
    _DIGIT_NAMES = {
        'zero': '0',
        'one': '1',
        'two': '2',
        'three': '3',
        'four': '4',
        'five': '5',
        'six': '6',
        'seven': '7',
        'eight': '8',
        'nine': '9',
    }

    def __init__(self):
        """Set the page URL template and the request headers."""
        self.url = 'http://glidedsky.com/level/web/crawler-font-puzzle-1?page={}'
        # NOTE(review): the Cookie below is a hard-coded logged-in session.
        # It will expire and should not live in source control; replace it
        # with your own session cookie before running.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
            'Cookie': '_ga=GA1.2.2076166588.1599182485; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1599182485,1599184017; __gads=ID=c94e2e5e9c6d1406:T=1599184036:S=ALNI_Ma_JRzKSRhVrnobJU2CRfVfW3AS7A; footprints=eyJpdiI6IkQrYzM4dGtnMmFFOHlUZ3NoT3B4Rmc9PSIsInZhbHVlIjoiTnliMGNWaytFeG5uZkxacVFRRzF4Z3ZOVWVpb09pN2RHa0xDMFNUV0g3MVo5a0dkc1ZZMUhYVkFueTFIbFBSeSIsIm1hYyI6IjUzMDVkMTk4MjkyYzBlMmUzYmMxMjgwMDVjNzM3MGQ2YzU0YjBiZmMzMGEzNGQ5OGNjZTc4ZWNjZDRjMmRiOGYifQ%3D%3D; _gid=GA1.2.1046180803.1599808017; XSRF-TOKEN=eyJpdiI6IjEySDZcL3JteVYzaGUrU3AzXC9DaDNzZz09IiwidmFsdWUiOiIrWXBBbjZlbktmckthNVZPcVZodGNCNEhCWVZFd1dXbzEzVFRVY3RubnpOcVdHYmJkWDFsYzN6SkcrQ3Exb3RBIiwibWFjIjoiZjVhYzAyMzQxMTgxMjZhMGY5YTY4ZTZiOGNkY2NiYjI1NjgwYjZiZDljMzIyOGYwZDkzNWMxNWY5MjVlYWJhYyJ9; glidedsky_session=eyJpdiI6ImpGWnlHbTVYV1AxbEYrSnRkcHBZWmc9PSIsInZhbHVlIjoiallUZDNzUlk1RWowSklycjVTZjkrbEh1eDlYaGtYUHZpRE94akFCVVVTcUFqbUllUEV4RWthSkhVTFNmMW1RaiIsIm1hYyI6IjlhMmEwM2E1YTc1Mjk1NTk0MWQ1MWQwY2JiYmEyMTJiMDUzODRmNjkwMThiNjVlMWVlZGI4ZTA1YjBiOGM5MDQifQ%3D%3D; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1599809958',
            'Host': 'glidedsky.com',
        }

    def get_results(self, url, timeout=10):
        """Fetch one page and return the sum of its decoded numbers.

        Args:
            url: Full page URL (normally ``self.url.format(page)``).
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the whole run.

        Returns:
            int: Sum of all numbers found on the page after translating each
            page-source digit to its real digit.

        Raises:
            requests.HTTPError: The server answered with an error status.
            RuntimeError: The page contains no embedded font, so its digits
                cannot be decoded.
        """
        response = requests.get(url=url, headers=self.headers, timeout=timeout)
        response.raise_for_status()

        font_match = self._FONT_RE.search(response.text)
        if font_match is None:
            # Fail loudly instead of returning None and breaking the caller's
            # running total with an opaque TypeError.
            raise RuntimeError(
                'no embedded font found in {!r}; is the session cookie '
                'still valid?'.format(url)
            )

        with open(self.FONT_PATH, 'wb') as font_file:
            font_file.write(base64.b64decode(font_match.group(1)))

        # get_xml_ttf() maps real digit -> page-source digit; invert it once
        # so each character is a single dict lookup. Glyph names that
        # get_number() does not recognise come back as None and are skipped.
        page_to_real = {
            page_digit: real_digit
            for real_digit, page_digit in self.get_xml_ttf().items()
            if page_digit is not None
        }

        tree = etree.HTML(response.text)
        total = 0
        for cell in tree.xpath('//div[@class="row"]/div'):
            texts = cell.xpath('./text()')
            if not texts:
                continue  # no direct text node in this <div>
            digits = self._decode(texts[0], page_to_real)
            if digits:  # ignore cells that hold no digits at all
                total += int(digits)
        return total

    @staticmethod
    def _decode(text, page_to_real):
        """Translate the page-source digits in *text* to real digits.

        Characters without an entry in *page_to_real* (whitespace, newlines,
        anything else) are dropped.
        """
        return ''.join(page_to_real[ch] for ch in text if ch in page_to_real)

    def get_xml_ttf(self):
        """Read the saved font and build the digit mapping.

        Also dumps the font to ``XML_PATH``.

        Returns:
            dict: Keys are the characters ``'0'``..``'9'``. The value for key
            ``k`` is the digit named by the glyph at position ``k + 1`` of the
            font's glyph order, i.e. the digit as it appears in the raw page
            source. The value is ``None`` if that glyph name is not an English
            digit name.

        Raises:
            IndexError: The font has fewer than 11 glyphs.
        """
        font = TTFont(self.FONT_PATH)
        try:
            font.saveXML(self.XML_PATH)
            glyph_order = font.getGlyphOrder()
        finally:
            font.close()  # release the file handle held by the font reader
        # Position 0 of the glyph order is skipped; positions 1..10 are used
        # for the digits 0..9.
        return {
            str(digit): self.get_number(glyph_order[digit + 1])
            for digit in range(10)
        }

    def get_number(self, en_number):
        """Convert an English digit name to its digit character.

        Args:
            en_number: Glyph name such as ``'zero'`` or ``'seven'``.

        Returns:
            The digit as a one-character string, or ``None`` when the name is
            not one of ``'zero'``..``'nine'``.
        """
        return self._DIGIT_NAMES.get(en_number)
if __name__ == '__main__':
    # Walk pages 1..1000 of the level and keep a running grand total of the
    # per-page sums.
    solver = GlidedSky()
    grand_total = 0
    for page_no in range(1, 1001):
        page_url = solver.url.format(page_no)
        grand_total += solver.get_results(page_url)
        print(grand_total)  # progress: running total after this page
    # Final answer once every page has been processed.
    print(grand_total)