爬虫10+练习 1-glidedsky基础两篇爬虫

glidedsky基础两篇爬虫
第一篇
在这里插入图片描述
查看数字所在元素的 class 属性

在这里插入图片描述
查看哪个包会返回登录状态,显而易见是crawler-basic-1,GET包
在这里插入图片描述
下面是代码部分

import requests
from lxml import etree
header = {

  'Cookie': '_ga=GA1.2.1184184785.1603891776; _gid=GA1.2.1035086129.1603891776; __gads=ID=dbb3e55c8f8b5dbd-22d7259d6ac40095:T=1603891776:RT=1603891776:S=ALNI_MZKrP2YEAaLi4sr_yZZlR0YP5jX_w; footprints=eyJpdiI6IjA2dXptbzNaa0ZRazZCdklJYzE2Z3c9PSIsInZhbHVlIjoienFXM1hVTHFIR2NsYzVnT3FiSXVHU3ZPbWpFbGhaMTVWMFwvU0JJVW9aNlZRVlJjMGZcL0tPUm5qN2swZFJxeG9hIiwibWFjIjoiYmNiZGZhMWZiZGVlYWY0ZjY0OTk0YzU5YjExMGYxZTJjZmIzZWM4N2Y3ZDYyYzExNzA5ZGNjNTlkY2I0OTZhNyJ9; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1603891776,1603929984; remember_web_59ba36addc2b2f9401580f014c7f58ea4e30989d=eyJpdiI6Ik1nSURHTEZ0K3dMVFFSSFwveUhcL1pPUT09IiwidmFsdWUiOiI3enZLbVJXQVFcL2RRYUl4UlorZU4zNmVwYWRmZktWZEhVbkdiNXN6Z0dPUTVUdXFLc1B6bmZTb0tzKzYrbit0OGNzWlk1cmFWbHoxYTBCM2tXbjM1QWpHcVZCaFR5TXVmNXNnNXR2cm1FRWlcL3lBMUFLNzgzMWlZYUFWY1VuVXJuN01mdUdydWJoSWMzeVFcL0Rad0VxTjJBSExIR3Nla3VWSFd6VmRKQ1wvRlBRPSIsIm1hYyI6IjM2YmM5OWU2MmE4NWNjODhjNTE3MzFkYzQyMWM2ZmI1YzlhZTliZGJhMjgwMDYzNGM0NWUzZDk0ODU0ZWFkOWYifQ%3D%3D; XSRF-TOKEN=eyJpdiI6ImR6dUVCa2lSK3NET0kyRnc2TGtCamc9PSIsInZhbHVlIjoiQkNlRU1ta3dnQ2lYNFpTa1h5VmxJblJRY29Vem5IVUIwK2NFSXcwMWxGS2tpcjM5RTZ0K3p5OUpMNEJhSDZHeCIsIm1hYyI6ImQzNTc5ZmMwZDhiYzJiZDBkNjBjNWIxZDM4MWFkYzY3OGZjZTUzZjU3OTgyMDVhNTBiMzFmNjg1MmE1Njk0MTUifQ%3D%3D; glidedsky_session=eyJpdiI6Ik1VRkR3Q2VoQ3lvMHdUNGZvXC80XC9EZz09IiwidmFsdWUiOiJ2NmtYYTFwdnJibVU2elh6V01LRzUrcEVSMUwwNE1jVXFSK1JXQitYUk92VjU2b1RkMzh1ODZIRGRiSE1KUWtLIiwibWFjIjoiNGFiODUyZGZkZDFkZTlmMGIzNTdlNjViNzFlNDJhYTVkM2RmMjBjNWEzOTg4MmRhMTFlZmU1MTJiMDg3NmVmZCJ9; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1603930005',
'Host': 'glidedsky.com',
'Referer': 'http://glidedsky.com/level/crawler-basic-1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'

}
def run(url):
    """Download one page and sum the numbers shown on it.

    Args:
        url: Address of the page to fetch. The request is sent with the
            module-level ``header`` dict (cookie, host, referer, user agent).

    Returns:
        The integer sum of the text found in every ``div`` whose ``class``
        attribute is exactly ``col-md-1``. The sum is also printed.

    Raises:
        requests.Timeout: If the server does not answer within the timeout.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If a non-blank text node is not an integer.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(url, headers=header, timeout=10)
    # Fail loudly on an error status instead of summing an error page.
    res.raise_for_status()
    html = etree.HTML(res.text)
    cells = html.xpath("//div[@class='col-md-1']/text()")
    # Whitespace-only text nodes carry no number and would make int() raise.
    total = sum(int(cell) for cell in cells if cell.strip())
    print(total)
    return total


if __name__ == "__main__":
    # Script entry point: sum the numbers on the first basic crawler level.
    level_url = 'http://glidedsky.com/level/web/crawler-basic-1'
    run(level_url)

密码是啥呢?300309

第二篇
在这里插入图片描述
还是先看数字Class类型,发现没有改变
数据包变为了crawler-basic-2,点击第二页,发现为crawler-basic-2?page=2,再点击回去,为crawler-basic-2?page=1,那么第1000页自然就是crawler-basic-2?page=1000,规律找到了:只需把page参数从1遍历到1000
代码部分(被注释掉的是原来的写法;逐页顺序请求1000页大约需要两分钟,想更快的话建议改用多线程)

import requests
import bs4
from bs4 import BeautifulSoup

header = {

        'Cookie': '_ga=GA1.2.1184184785.1603891776; _gid=GA1.2.1035086129.1603891776; __gads=ID=dbb3e55c8f8b5dbd-22d7259d6ac40095:T=1603891776:RT=1603891776:S=ALNI_MZKrP2YEAaLi4sr_yZZlR0YP5jX_w; footprints=eyJpdiI6IjA2dXptbzNaa0ZRazZCdklJYzE2Z3c9PSIsInZhbHVlIjoienFXM1hVTHFIR2NsYzVnT3FiSXVHU3ZPbWpFbGhaMTVWMFwvU0JJVW9aNlZRVlJjMGZcL0tPUm5qN2swZFJxeG9hIiwibWFjIjoiYmNiZGZhMWZiZGVlYWY0ZjY0OTk0YzU5YjExMGYxZTJjZmIzZWM4N2Y3ZDYyYzExNzA5ZGNjNTlkY2I0OTZhNyJ9; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1603891776,1603929984; remember_web_59ba36addc2b2f9401580f014c7f58ea4e30989d=eyJpdiI6Ik1nSURHTEZ0K3dMVFFSSFwveUhcL1pPUT09IiwidmFsdWUiOiI3enZLbVJXQVFcL2RRYUl4UlorZU4zNmVwYWRmZktWZEhVbkdiNXN6Z0dPUTVUdXFLc1B6bmZTb0tzKzYrbit0OGNzWlk1cmFWbHoxYTBCM2tXbjM1QWpHcVZCaFR5TXVmNXNnNXR2cm1FRWlcL3lBMUFLNzgzMWlZYUFWY1VuVXJuN01mdUdydWJoSWMzeVFcL0Rad0VxTjJBSExIR3Nla3VWSFd6VmRKQ1wvRlBRPSIsIm1hYyI6IjM2YmM5OWU2MmE4NWNjODhjNTE3MzFkYzQyMWM2ZmI1YzlhZTliZGJhMjgwMDYzNGM0NWUzZDk0ODU0ZWFkOWYifQ%3D%3D; XSRF-TOKEN=eyJpdiI6Ik1oeFg1S01jQXdxN05BRmxrWmhpYnc9PSIsInZhbHVlIjoibUZDcUZ6cG80YWMyXC9pSytXdVI3YWdLK2dLTlN6OTk1WFwvVzdxNUFxTGl3bXpPQW0xNk5VNnR3Nlc2REN5bHVIIiwibWFjIjoiZDIzNTE4YzMzMjQzYmFiMmVmNmFhMDdkNWJhNWQwNTVmMjQyZWQ1ZDcxYWFmYTI2ZjM2MWE2MzVmOTJhMTY3ZiJ9; glidedsky_session=eyJpdiI6IjVFSzR3UGdGS2h5MDVyNDM4ZCtxR0E9PSIsInZhbHVlIjoiaVwvOEdyUVwvYU9rdUw0MDFjZzlrWXBhQ2pYREVUWEhkUTRTSTFEQnMrMGJEcXh4dHNGbWJld0J3RW9yd3Baa29DIiwibWFjIjoiZTkyODVjNGI1MWY3NjgwOTA4Yzc1MjRhOTg0NGM2Y2NlN2ZjNjViMTQ4ZTBmZjYxMDAwZWNiOWMyZWE5ZGM4YyJ9; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1603931070',
        'Host': 'glidedsky.com',
        'Referer': 'http://glidedsky.com/level/crawler-basic-2',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
number = 0


# def run(url):
#     res = requests.get(url, headers=header)
#     html = etree.HTML(res.text)
#     numbers = html.xpath("//div[@class='col-md-1']/text()")
#     totle = 0
#     for number in numbers:
#         totle += int(number)
#
#     return totle

def getNumber(response):
    """Add every number on one fetched page to the global ``number``.

    Args:
        response: Object whose ``text`` attribute holds the page HTML
            (the main loop passes the result of ``requests.get``).

    Raises:
        ValueError: If the stripped text of a matched element is not an
            integer.
    """
    # Running total shared with the script's main loop.
    global number
    page = BeautifulSoup(response.text, "lxml")
    # Visit every element carrying the col-md-1 class.
    for cell in page.find_all(class_="col-md-1"):
        # The tag's own text is enough; stringifying the tag and parsing it
        # again as a new document yields the same text at extra cost.
        number += int(cell.text.strip())


# 开始
# if __name__ == "__main__":
#     COUNT = 0
#     for i in range(1, 1001):
#         url = f'http://glidedsky.com/level/web/crawler-basic-2?page={i}'
#         COUNT += run(url)
#
#     print(COUNT)
if __name__ == '__main__':
    # Walk pages 1..1000 one after another; getNumber() adds each page's
    # numbers to the global total, which is printed at the end.
    for page in range(1, 1001):
        print(f"第{page}页")
        url = f"http://glidedsky.com/level/web/crawler-basic-2?page={page}"
        # Bound each request so one stalled page cannot hang the whole run.
        response = requests.get(url=url, headers=header, timeout=10)
        # Stop on an HTTP error instead of parsing an error page.
        response.raise_for_status()
        getNumber(response)
    print(number)

密码是啥呢?3356211
幸运眷顾每一个看到最后的朋友,接下来我会更新十个爬虫练习,偏基础,大佬勿喷,cookie值是不同的,所以需自行修改哦!

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值