爬虫刷题-glidedsky

最近群里传得很火的一个爬虫练习网站.(网站做得非常走心, 支持一下.)

url : http://glidedsky.com/
在这里插入图片描述

做了几个题感觉非常有意思, 和大家交流分享一波.

第一题:

  • 这里有一个网站,里面有一些数字。把这些数字的总和,输入到答案框里面,即可通过本关. 这个咱就不说了, 把数据全都取出相加就可以通关了.

第二题:

  • 在第一题的基础上加了翻页.在code中加个链接迭代就行了.

第三题(IP 屏蔽):

  • 从这开始就有意思了.每个 IP,只能访问一次,之后就会被封禁。一共1000页.也就是要最少1000个代理ip,
    因为没那么多ip咱就不做了.有条件的小伙伴可以选阿布云的动态ip.每次请求都是一个新的ip. 按照第二题的思路就可以搞定

第四题 (字体反爬):

在这里插入图片描述

  • 很明显可以看出从页面源码中得到的数据和实际在浏览器中看到的数字是不一样的.并且每次刷新页面源码中的数字都不同,
    说明font是动态加载的.

在这里插入图片描述

  • 源码中可以找到以base64编码内嵌的字体文件. 用base64解码后保存成woff文件, 再用FontCreator打开就可以看到字形和数字之间的映射关系.
    直接上代码.
import requests
import re
from lxml import etree
import base64
from fontTools.ttLib import TTFont



# One HTTP session shared by every request in this script, so cookies set by
# the login request are sent with the later page requests.
session = requests.Session()
h = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
}
login_url = "http://glidedsky.com/login"
# Merge into the session's default headers rather than replacing the whole
# mapping, so the defaults requests sets up (Accept-Encoding, Accept, ...)
# are kept and the mapping stays case-insensitive.
session.headers.update(h)
# Running total of all decoded numbers; updated by callback().
sum_number = 0
# Glyph name as listed in the font's glyph order -> digit character.
a_map = {
    'one': '1',
    'two': '2',
    'three': '3',
    'four': '4',
    'five': '5',
    'six': '6',
    'seven': '7',
    'eight': '8',
    'nine': '9',
    'zero': '0',
}


def get_token():
    """Fetch the login page and return the CSRF token embedded in it.

    Returns:
        The ``content`` value of the page's ``csrf-token`` meta tag.

    Raises:
        ValueError: if the login page contains no ``csrf-token`` meta tag.
    """
    resp = session.get(login_url)
    match = re.search('<meta name="csrf-token" content="(.*?)">', resp.text)
    if match is None:
        # Fail with a readable message instead of an AttributeError raised
        # by calling .group() on None.
        raise ValueError('csrf-token meta tag not found in ' + login_url)
    return match.group(1)


def login(token):
    """Submit the login form through the shared session.

    Args:
        token: CSRF token, sent as the form's ``_token`` field.

    The ``email`` and ``password`` literals are placeholder text ("your
    account" / "your password") and must be replaced with real credentials.
    The response of the POST is not inspected.
    """
    session.post(
        login_url,
        data={'_token': token, 'email': '你的账号', 'password': '你的密码'},
    )


def get_html(url):
    """GET *url* with the shared session and return the response body as text."""
    return session.get(url).text


def parse_font(data):
    """Build the digit translation table from the font embedded in a page.

    The page source carries its font inline as a base64 payload. Glyph 0 is
    skipped; every later glyph is mapped from the digit its name spells
    (looked up in ``a_map``) to its glyph index minus one.

    Args:
        data: HTML source of the page.

    Returns:
        dict mapping a digit character (a value of ``a_map``) to an int.

    Raises:
        ValueError: if no base64 font payload is found in ``data``.
        KeyError: if a glyph name past index 0 is not a key of ``a_map``.
    """
    from io import BytesIO

    match = re.search(r'base64,(.*?)\)', data, re.S)
    if match is None:
        raise ValueError('no base64-embedded font found in page source')

    # Parse the font from memory and read the glyph order directly, instead
    # of writing font.woff / font.xml to disk and regex-parsing the XML dump.
    font = TTFont(BytesIO(base64.b64decode(match.group(1))))
    glyph_names = font.getGlyphOrder()

    # Skipping index 0 and enumerating from 0 yields "glyph index - 1",
    # the same value the XML-based parsing derived from each GlyphID entry.
    return {a_map[name]: value for value, name in enumerate(glyph_names[1:])}


def callback(data, font_map):
    """Decode every number cell of a page and add it to ``sum_number``.

    Args:
        data: HTML source of the page.
        font_map: mapping from each character found in the page source to
            the digit it stands for (as returned by ``parse_font``).

    Raises:
        KeyError: if a cell contains a character missing from ``font_map``.
    """
    global sum_number
    html = etree.HTML(data)
    for text in html.xpath('//*[@class="col-md-1"]/text()'):
        shown = text.replace('\n', '').strip()
        if not shown:
            # A whitespace-only text node carries no number; passing the
            # empty string to int() would raise ValueError.
            continue
        sum_number += int(''.join(str(font_map[ch]) for ch in shown))


def main():
    """Log in, then fetch and process pages 1 through 1000 of the font puzzle."""
    login(get_token())
    base = "http://glidedsky.com/level/web/crawler-font-puzzle-1?page="
    for page in range(1, 1001):
        page_html = get_html(base + str(page))
        # The font is re-parsed for every page before its numbers are decoded.
        callback(page_html, parse_font(page_html))


if __name__ == '__main__':

    # Run the crawl, then print the total accumulated in sum_number.
    main()
    print(sum_number)

第五题 CSS反爬:

  • 观察一下classname, 在源码的style标签中可以找到对应的属性. 多对比几次就会发现规律

在这里插入图片描述
继续上代码

import requests
import re
from lxml import etree
from operator import itemgetter



# One HTTP session shared by every request in this script, so cookies set by
# the login request are sent with the later page requests.
session = requests.Session()
h = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
}
login_url = "http://glidedsky.com/login"
# Merge into the session's default headers rather than replacing the whole
# mapping, so the defaults requests sets up (Accept-Encoding, Accept, ...)
# are kept and the mapping stays case-insensitive.
session.headers.update(h)
sum_number = 0
# Numbers recovered so far; get_data() appends to this list in place.
items = []


def get_token():
    """Fetch the login page and return the CSRF token embedded in it.

    Returns:
        The ``content`` value of the page's ``csrf-token`` meta tag.

    Raises:
        ValueError: if the login page contains no ``csrf-token`` meta tag.
    """
    resp = session.get(login_url)
    match = re.search('<meta name="csrf-token" content="(.*?)">', resp.text)
    if match is None:
        # Fail with a readable message instead of an AttributeError raised
        # by calling .group() on None.
        raise ValueError('csrf-token meta tag not found in ' + login_url)
    return match.group(1)


def login(token):
    """Submit the login form through the shared session.

    Args:
        token: CSRF token, sent as the form's ``_token`` field.

    The ``email`` and ``password`` literals are placeholder text ("your
    account" / "your password") and must be replaced with real credentials.
    The response of the POST is not inspected.
    """
    session.post(
        login_url,
        data={'_token': token, 'email': '你的账号', 'password': '你的密码'},
    )


def get_html(url):
    """GET *url* with the shared session and return the response body as text."""
    return session.get(url).text

def get_data(response):
    """Recover the numbers hidden by the CSS puzzle and append them to ``items``.

    For every child ``div`` of a ``div`` with class ``row``, each inner
    ``div`` is looked up by its class name in the page's CSS rules:

    * a class that has an ``opacity`` rule is ignored;
    * a class that has a ``content`` rule contributes that value directly
      as a whole number;
    * otherwise the element's text is placed at its running position, plus
      the ``left`` offset when the class has one.

    The placed texts are then joined in position order to form one number.

    Args:
        response: HTML source of the page, including its CSS rules.

    Returns:
        The module-level ``items`` list, which is extended in place.
    """
    html = etree.HTML(response)
    for cell in html.xpath('//div[@class="row"]/div'):
        placed = []    # (position, text) for every visible digit element
        position = 1   # running position; advances only for placed elements
        for digit_div in cell.xpath('./div'):
            class_name = digit_div.xpath('./@class')[0]
            # Escape the class name so it is matched literally in the regex.
            rule_pattern = re.escape(class_name) + r'.*{( .* )?}'

            props = {}
            for declaration in re.findall(rule_pattern, response):
                name, sep, raw = declaration.partition(':')
                if not sep:
                    # Empty rule body: the optional group captured nothing.
                    continue
                # strip('em') removes trailing 'e'/'m' characters, i.e. the
                # "em" unit of values such as "-1em".
                props[name.strip()] = raw.strip().strip('em')

            if 'opacity' in props:
                # Any opacity rule marks the element as one to skip.
                continue
            if 'content' in props:
                items.append(int(props['content'].strip('"')))
                continue

            slot = position
            if 'left' in props:
                slot += int(props['left'])
            position += 1
            # A computed slot of 0 is a valid position and must be kept.
            placed.append((slot, ''.join(digit_div.xpath('./text()'))))

        if placed:
            placed.sort(key=itemgetter(0))
            items.append(int(''.join(text for _, text in placed)))

    return items




def main():
    """Log in, fetch page 1 of the CSS puzzle and print the recovered numbers."""
    login(get_token())
    url = 'http://glidedsky.com/level/web/crawler-css-puzzle-1?page=1'
    # get_data() parses the HTML itself, so no separate parse is needed here.
    get_data(get_html(url))
    print(items)


if __name__ == '__main__':
    # Run the crawl only when this file is executed as a script.
    main()

暂时就做了这几个, 剩下的有时间继续…

  • 7
    点赞
  • 22
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值