最近群里传的很火的一个爬虫练习网站.(网站做的非常走心, 支持一下.)
url : http://glidedsky.com/
做了几个题感觉非常有意思, 和大家交流分享一波.
第一题:
- 这里有一个网站,里面有一些数字。把这些数字的总和,输入到答案框里面,即可通过本关. 这个咱就不说了, 把数据全都取出相加就可以通关了.
第二题:
- 在第一题的基础上加了翻页.在code中加个链接迭代就行了.
第三题(IP 屏蔽):
- 从这开始就有意思了.每个 IP,只能访问一次,之后就会被封禁。一共1000页.也就是要最少1000个代理ip,
因为没那么多ip咱就不做了.有条件的小伙伴可以选阿布云的动态ip.每次请求都是一个新的ip. 按照第二题的思路就可以搞定
第四题 (字体反爬):
- 很明显可以看出从页面源码中得到的数据和实际在浏览器中看到的数字是不一样的.并且每次刷新页面源码中的数字都不同,
说明font是动态加载的.
- 源码中可以找到字体文件. 用base64解码保存成woff文件.再用fontcreator打开就可以看到他们的映射关系.
直接上码.
import requests
import re
from lxml import etree
import base64
from fontTools.ttLib import TTFont
# Shared HTTP session; a desktop Chrome User-Agent keeps the site from
# rejecting the requests as an obvious bot.
session = requests.session()
h = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
}
login_url = "http://glidedsky.com/login"
session.headers = h

# Running total of every number scraped across all pages.
sum_number = 0

# Glyph names used inside the obfuscated font, ordered so that index+1
# (mod 10) is the digit each name draws: 'one'->'1', ..., 'zero'->'0'.
_DIGIT_WORDS = ('one', 'two', 'three', 'four', 'five',
                'six', 'seven', 'eight', 'nine', 'zero')
a_map = {word: str((idx + 1) % 10) for idx, word in enumerate(_DIGIT_WORDS)}
def get_token():
    """Fetch the login page and pull the CSRF token out of its <meta> tag."""
    page = session.get(login_url).text
    match = re.search('<meta name="csrf-token" content="(.*?)">', page)
    return match.group(1)
def login(token):
    """POST the credentials plus CSRF *token* to establish a logged-in session."""
    payload = {
        '_token': token,
        'email': '你的账号',
        'password': '你的密码',
    }
    session.post(login_url, data=payload)
def get_html(url):
    """Return the body of *url* fetched through the shared session."""
    return session.get(url).text
def parse_font(data):
    """Decode the page's embedded WOFF font and build the digit mapping.

    The site inlines a base64-encoded @font-face whose glyph order is
    shuffled on every request, so the digit shown in the page source is
    not the digit actually rendered.  Dumping the font's GlyphOrder
    table reveals the real value behind each source digit.

    :param data: HTML text of one puzzle page.
    :return: dict mapping source digit (str, '0'-'9') to the digit the
             glyph really renders (int).
    """
    file_name = 'font'
    # The font payload sits in a style block as "...base64,<payload>)".
    base = re.search(r'base64,(.*?)\)', data, re.S).group(1)
    with open('font.woff', 'wb') as f:
        f.write(base64.b64decode(base))
    # Round-trip through fontTools' XML dump so the GlyphOrder table can
    # be scraped as plain text.
    font = TTFont(file_name + '.woff')
    font.saveXML(file_name + '.xml')
    with open(file_name + '.xml', encoding='utf-8') as f:
        xml_text = f.read()
    font_map = dict()
    glyph_order = re.findall('<GlyphOrder>(.*?)</GlyphOrder>', xml_text, re.S)[0]
    # Skip the leading non-digit glyphs (e.g. .notdef) and the trailing
    # closing line; each remaining line describes one digit glyph.
    for line in glyph_order.split('\n')[3:-1]:
        line = line.strip()
        key = re.search('<GlyphID id=".*?" name="(.*?)"/>', line).group(1)
        value = int(re.search('<GlyphID id="(.*?)" name=".*?"/>', line).group(1)) - 1
        # a_map turns the glyph name ('one'..'zero') into the source digit.
        font_map[a_map[key]] = value
    return font_map
def callback(data, font_map):
    """Decode every obfuscated number on the page and add it to the global total.

    :param data: HTML text of the page.
    :param font_map: source-digit -> real-digit mapping from parse_font().
    """
    global sum_number
    tree = etree.HTML(data)
    for raw in tree.xpath('//*[@class="col-md-1"]/text()'):
        digits = raw.replace('\n', '').strip()
        decoded = ''.join(str(font_map[ch]) for ch in digits)
        sum_number += int(decoded)
def main():
    """Log in, then walk all 1000 pages, decoding the shuffled font on each."""
    login(get_token())
    base = "http://glidedsky.com/level/web/crawler-font-puzzle-1?page="
    for num in range(1, 1001):
        page = get_html(base + str(num))
        callback(page, parse_font(page))
if __name__ == '__main__':
    main()
    # sum_number was accumulated globally by callback() during main().
    print(sum_number)
第五题 CSS反爬:
- 观察一下classname, 在源码的style标签中可以找到对应的属性. 多对比几次就会发现规律
继续上码
import requests
import re
from lxml import etree
from operator import itemgetter
# Desktop Chrome User-Agent so the site serves normal pages.
h = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
}
login_url = "http://glidedsky.com/login"

# Shared session carrying the login cookie and headers.
session = requests.session()
session.headers = h

sum_number = 0  # kept for parity with the font solution; not updated here
items = []      # decoded numbers collected across calls to get_data()
def get_token():
    """Scrape the CSRF token from the login page's <meta> tag."""
    html_text = session.get(login_url).text
    return re.search('<meta name="csrf-token" content="(.*?)">', html_text).group(1)
def login(token):
    """Establish a logged-in session by POSTing credentials and CSRF *token*."""
    session.post(login_url, data={
        '_token': token,
        'email': '你的账号',
        'password': '你的密码',
    })
def get_html(url):
    """Fetch *url* via the shared session and return the response text."""
    response = session.get(url)
    return response.text
def get_data(response):
    """Parse one CSS-puzzle page and append every decoded number to ``items``.

    Each number is split into per-digit <div>s whose CSS class controls what
    is actually rendered:

      * ``opacity: 0``   -> invisible decoy digit; must be ignored;
      * ``content: "x"`` -> digit injected via a pseudo-element (recorded as
                            its own entry, matching the original behaviour);
      * ``left: N``      -> digit shifted N positions from its source order;
      * no rule          -> digit stays at its natural position.

    Digits are re-ordered by their computed position, then joined back into
    the real number.

    :param response: HTML text of the puzzle page (style rules are inline).
    :return: the global ``items`` list (also mutated in place).
    """
    html = etree.HTML(response)
    divs = html.xpath('//div[@class="row"]/div')
    for div in divs:
        div_list = div.xpath('./div')
        a_items = []
        left = 1  # next natural (unshifted) digit position
        for cla in div_list:
            class_name = cla.xpath('./@class')
            # Locate this class's style rule in the raw page text and
            # parse its "prop: value" declarations.
            res = class_name[0] + r'.*{( .* )?}'
            div_num = cla.xpath('./text()')
            styles = {}
            for declaration in re.findall(res, response):
                if ':' not in declaration:
                    continue  # empty rule body -> nothing to parse
                prop, _, raw = declaration.partition(':')
                styles[prop.strip()] = raw.strip().strip('em')
            # BUG FIX: CSS values are parsed as strings; the original
            # compared opacity against the int 0 (never true) and the
            # branch body was `pass`, so invisible decoy digits were kept
            # and sorted to the front of the number.
            if 'opacity' in styles:
                if styles['opacity'] == '0':
                    continue  # decoy digit: drop it entirely
                valu = False
            elif 'content' in styles:
                items.append(int(styles['content'].strip('"')))
                valu = False
            elif 'left' in styles:
                valu = left + int(styles['left'])
            else:
                valu = left
            if valu:
                left += 1
            a_items.append({'num': ''.join(div_num), 'valu': valu})
        print(a_items)
        if a_items:
            # Reassemble the digits in display order.
            a_items.sort(key=itemgetter('valu'), reverse=False)
            items.append(int(''.join(entry['num'] for entry in a_items)))
    return items
def main():
    """Log in and decode the numbers on the first CSS-puzzle page.

    The original also built an unused ``etree.HTML(response)`` tree here;
    get_data() parses the HTML itself, so that dead local is removed.
    """
    login(get_token())
    # NOTE(review): only page 1 is fetched; loop over ?page=N to cover
    # the whole level.
    url = 'http://glidedsky.com/level/web/crawler-css-puzzle-1?page=1'
    response = get_html(url)
    get_data(response)
    print(items)
if __name__ == '__main__':
    # Entry point: main() prints the collected numbers itself.
    main()
暂时就做了这几个, 剩下的有时间继续…