glidedsky Crawler: Cracking the CSS Anti-Crawling Challenge

It's almost quitting time, so no long preamble: the code comes first. If anything is unclear, leave a comment. The key to this level is spotting the pattern!
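Before the full script, here is the trick in miniature. Each number on the page is split into single-digit div elements, and the page's CSS obfuscates them in three ways: opacity: 0 marks decoy digits to ignore, content: "…" injects a digit through a stylesheet rule, and left: Nem shifts a digit N slots away from its DOM position. The sketch below uses made-up markup and hypothetical class names x1..x3 (the real ones are randomized per request) and decodes the opacity and left tricks; the content trick is handled in the full script:

# A toy version of the decoding logic on made-up markup; hypothetical class
# names x1..x3 stand in for the randomized ones on the real page.
import re

css = '''
.x1 { left: 1em }
.x2 { opacity: 0 }
.x3 { left: -1em }
'''
# (class, digit) pairs in the order they appear in the DOM
digits = [('x1', '2'), ('x2', '9'), ('x3', '4')]

rules = dict(re.findall(r'\.(\w+)\s*\{\s*([^}]*?)\s*\}', css))
placed = []
pos = 1  # counts visible digits only, mirroring `left` in the script below
for cls, text in digits:
    rule = rules.get(cls, '')
    if 'opacity' in rule:
        continue  # decoy digit: hidden, occupies no slot
    shift = re.search(r'left:\s*(-?\d+)em', rule)
    placed.append((pos + int(shift.group(1)) if shift else pos, text))
    pos += 1
placed.sort()
print(''.join(text for _, text in placed))  # prints 42

Sorting the (position, digit) pairs and joining them recovers the on-screen number, which is exactly what gao() does below.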

"""
@author: Cjp
@file: cssfan.py
@time: 2020/9/11 16:37
"""

import re
from operator import itemgetter
import requests
from lxml import etree


def gao(url):
    """Fetch one puzzle page and return the list of numbers it contains."""
    items = []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
        'Cookie': '_ga=GA1.2.2076166588.1599182485; __gads=ID=c94e2e5e9c6d1406:T=1599184036:S=ALNI_Ma_JRzKSRhVrnobJU2CRfVfW3AS7A; footprints=eyJpdiI6InU5eklsSU9NSkhCWFVIaEkyUnM0cGc9PSIsInZhbHVlIjoiaG9wUEJqejI0MXlvZDZzc3c0T2NSRHhoTVVkcEZHcTJqTDZYdldqQTA4ZkY5bk1KYVRPR3l5dEFJZlNmM1FXNyIsIm1hYyI6Ijc2NDMzOTEwMDhmMGE0YzM5ZDAxNThhMzAwMzZhZjFlNWZiZGUzNWU0MWZjNTIyNzU3ZWRhZmY2ODdhNmJhMWMifQ%3D%3D; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1599182485,1599184017,1600066558; _gid=GA1.2.1388436899.1600066559; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IkNnVGc2RWRJMnIxWnc3SDNpNGFYRXc9PSIsInZhbHVlIjoiQzNCZzg2NVYwQ2pCVGloREpva1M4RFBZbGIwaTRWQTBTQXJPNXFLc0RONDdPYWhUVjlKdzZuWGNcL09uNUZvSmEiLCJtYWMiOiI3YzRlNGM0M2EwYzE2NmY2MDk5OTZmNWMzYWRlMzI2OWRmZDNlNzEzMzUxMmExNGE2NDkzM2YzNTk4NmVlMDczIn0%3D; glidedsky_session=eyJpdiI6IkxoSGlhRFhIcVhvQVBRMmltN2Z3SFE9PSIsInZhbHVlIjoiQ1p6TlBwWnNEYXFxRnFoNXRacVRBaVcyeE9QdFp6Y1BKaG8xbGw0RVp4bzFDSHloSExiS05FaHZNMUtMclJnTCIsIm1hYyI6ImVjZWQzMzZhZGI4YzYzMWNiZmNkZWVlNGFiZDQxMGRkZDkzNzg4OWQwZTY5NTYzMGE5YzRiNzA0NWU4YmEwMDQifQ%3D%3D; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1600067459', # replace this with your own account's cookie
        }

    response = requests.get(url, headers=headers, timeout=20)
    page = response.content.decode()  # decode once; the raw HTML also holds the CSS rules
    html = etree.HTML(page)
    divs = html.xpath('//div[@class="row"]/div[@class="col-md-1"]')
    for div in divs:
        clas = div.xpath('./div')
        a_items = []
        left = 1  # position index of the next visible digit
        for cla in clas:
            class_name = cla.xpath('./@class')
            res = class_name[0] + '.*{( .* )?}'  # regex for this class's style rule in the page source
            div_num = cla.xpath('./text()')  # the digit text of this element
            class_value = re.findall(res, page)
            item = {}
            for value in class_value:
                if ':' not in value:  # guard against empty rule bodies
                    continue
                vals = value.split(':')
                item[vals[0].strip()] = vals[1].strip().strip('em')
            valu = None
            if 'opacity' in item:
                if item['opacity'] == '0':  # parsed values are strings, not ints
                    # hidden decoy element, contributes nothing
                    pass
            elif 'content' in item:
                # the displayed value is injected via the CSS content property
                result = item['content']
                items.append(int(result.strip('"')))
            elif 'left' in item:
                valu = left + int(item['left'])  # position after the CSS shift
            else:
                valu = left  # element stays at its DOM position

            if valu is not None:  # 0 is a valid position, so test against None
                left += 1
                item_num = {'num': ''.join(div_num), 'valu': valu}
                a_items.append(item_num)


        if a_items:
            # reassemble the digits in their visual order
            a_items.sort(key=itemgetter('valu'))
            nums = ''.join(item['num'] for item in a_items)
            items.append(int(nums))
    # all numbers found on this page
    print(items)
    return items

def main():
    numbers = []
    for i in range(1, 1001):  # the challenge has 1000 pages
        url = 'http://www.glidedsky.com/level/web/crawler-css-puzzle-1?page={}'.format(i)
        numbers.extend(gao(url))
    print(numbers)
    print('Sum:', sum(numbers))



if __name__ == '__main__':
    main()
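A note on running this at scale: main() fires 1000 sequential requests, each opening a fresh connection. If the site starts refusing you, a requests.Session with a simple backoff is an easy drop-in for the requests.get call in gao(). This is only a sketch, not part of the original script, and the headers argument is assumed to be the same User-Agent/Cookie dict defined inside gao():

# Optional hardening sketch: reuse one TCP connection and back off on
# transient network errors instead of crashing.
import time
import requests

def make_fetcher(headers, retries=3):
    """Return a fetch(url) function that retries with exponential backoff."""
    session = requests.Session()
    session.headers.update(headers)  # same User-Agent/Cookie dict as in gao()

    def fetch(url):
        for attempt in range(retries):
            try:
                return session.get(url, timeout=20)
            except requests.RequestException:
                time.sleep(2 ** attempt)  # wait 1s, 2s, 4s between attempts
        raise RuntimeError('giving up on ' + url)

    return fetch

Inside gao(), the requests.get(...) line would then become response = fetch(url), with fetch built once in main().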
