glidedsky基础两篇爬虫
第一篇
查看数字的class类型
查看哪个包会返回登录状态,显而易见是crawler-basic-1,GET包
下面是代码部分
import requests
from lxml import etree
# HTTP headers sent with the request made in run().
# The Cookie value holds the author's own browser-session cookies; as the
# closing note of the article says, cookie values differ per user, so replace
# it with the cookie string from your own logged-in session.
header = {
'Cookie': '_ga=GA1.2.1184184785.1603891776; _gid=GA1.2.1035086129.1603891776; __gads=ID=dbb3e55c8f8b5dbd-22d7259d6ac40095:T=1603891776:RT=1603891776:S=ALNI_MZKrP2YEAaLi4sr_yZZlR0YP5jX_w; footprints=eyJpdiI6IjA2dXptbzNaa0ZRazZCdklJYzE2Z3c9PSIsInZhbHVlIjoienFXM1hVTHFIR2NsYzVnT3FiSXVHU3ZPbWpFbGhaMTVWMFwvU0JJVW9aNlZRVlJjMGZcL0tPUm5qN2swZFJxeG9hIiwibWFjIjoiYmNiZGZhMWZiZGVlYWY0ZjY0OTk0YzU5YjExMGYxZTJjZmIzZWM4N2Y3ZDYyYzExNzA5ZGNjNTlkY2I0OTZhNyJ9; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1603891776,1603929984; remember_web_59ba36addc2b2f9401580f014c7f58ea4e30989d=eyJpdiI6Ik1nSURHTEZ0K3dMVFFSSFwveUhcL1pPUT09IiwidmFsdWUiOiI3enZLbVJXQVFcL2RRYUl4UlorZU4zNmVwYWRmZktWZEhVbkdiNXN6Z0dPUTVUdXFLc1B6bmZTb0tzKzYrbit0OGNzWlk1cmFWbHoxYTBCM2tXbjM1QWpHcVZCaFR5TXVmNXNnNXR2cm1FRWlcL3lBMUFLNzgzMWlZYUFWY1VuVXJuN01mdUdydWJoSWMzeVFcL0Rad0VxTjJBSExIR3Nla3VWSFd6VmRKQ1wvRlBRPSIsIm1hYyI6IjM2YmM5OWU2MmE4NWNjODhjNTE3MzFkYzQyMWM2ZmI1YzlhZTliZGJhMjgwMDYzNGM0NWUzZDk0ODU0ZWFkOWYifQ%3D%3D; XSRF-TOKEN=eyJpdiI6ImR6dUVCa2lSK3NET0kyRnc2TGtCamc9PSIsInZhbHVlIjoiQkNlRU1ta3dnQ2lYNFpTa1h5VmxJblJRY29Vem5IVUIwK2NFSXcwMWxGS2tpcjM5RTZ0K3p5OUpMNEJhSDZHeCIsIm1hYyI6ImQzNTc5ZmMwZDhiYzJiZDBkNjBjNWIxZDM4MWFkYzY3OGZjZTUzZjU3OTgyMDVhNTBiMzFmNjg1MmE1Njk0MTUifQ%3D%3D; glidedsky_session=eyJpdiI6Ik1VRkR3Q2VoQ3lvMHdUNGZvXC80XC9EZz09IiwidmFsdWUiOiJ2NmtYYTFwdnJibVU2elh6V01LRzUrcEVSMUwwNE1jVXFSK1JXQitYUk92VjU2b1RkMzh1ODZIRGRiSE1KUWtLIiwibWFjIjoiNGFiODUyZGZkZDFkZTlmMGIzNTdlNjViNzFlNDJhYTVkM2RmMjBjNWEzOTg4MmRhMTFlZmU1MTJiMDg3NmVmZCJ9; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1603930005',
'Host': 'glidedsky.com',
'Referer': 'http://glidedsky.com/level/crawler-basic-1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
}
def run(url):
    """Download ``url`` and print the sum of the numbers shown on the page.

    The numbers are read from the text nodes of every
    ``<div class="col-md-1">`` element in the returned HTML.

    Args:
        url: Address of the page to download.

    Returns:
        The sum that was printed, as an ``int``.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an error status.
        ValueError: If a non-blank matched text node is not an integer.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(url, headers=header, timeout=10)
    # Fail loudly on an error status instead of parsing an error page.
    res.raise_for_status()
    html = etree.HTML(res.text)
    numbers = html.xpath("//div[@class='col-md-1']/text()")
    # int() tolerates surrounding whitespace; whitespace-only nodes are skipped
    # so they cannot crash the conversion.
    total = sum(int(number) for number in numbers if number.strip())
    print(total)
    return total
if __name__ == "__main__":
    # Entry point of the first script: sum the numbers on the single
    # crawler-basic-1 page.
    target_url = 'http://glidedsky.com/level/web/crawler-basic-1'
    run(target_url)
密码是啥呢?300309
第二篇
还是先看数字Class类型,发现没有改变
数据包变为了crawler-basic-2。点击第二页,发现请求变为crawler-basic-2?page=2;再点回第一页,则是crawler-basic-2?page=1。那么第1000页就是crawler-basic-2?page=1000,规律找到了:只需改变page参数即可遍历全部页面。
代码部分(被注释掉的是原来的写法;运行大约需要两分钟,建议使用多线程)
import requests
import bs4
from bs4 import BeautifulSoup
# HTTP headers sent with every page request in the main loop below.
# The Cookie value holds the author's own browser-session cookies; as the
# closing note of the article says, cookie values differ per user, so replace
# it with the cookie string from your own logged-in session.
header = {
'Cookie': '_ga=GA1.2.1184184785.1603891776; _gid=GA1.2.1035086129.1603891776; __gads=ID=dbb3e55c8f8b5dbd-22d7259d6ac40095:T=1603891776:RT=1603891776:S=ALNI_MZKrP2YEAaLi4sr_yZZlR0YP5jX_w; footprints=eyJpdiI6IjA2dXptbzNaa0ZRazZCdklJYzE2Z3c9PSIsInZhbHVlIjoienFXM1hVTHFIR2NsYzVnT3FiSXVHU3ZPbWpFbGhaMTVWMFwvU0JJVW9aNlZRVlJjMGZcL0tPUm5qN2swZFJxeG9hIiwibWFjIjoiYmNiZGZhMWZiZGVlYWY0ZjY0OTk0YzU5YjExMGYxZTJjZmIzZWM4N2Y3ZDYyYzExNzA5ZGNjNTlkY2I0OTZhNyJ9; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1603891776,1603929984; remember_web_59ba36addc2b2f9401580f014c7f58ea4e30989d=eyJpdiI6Ik1nSURHTEZ0K3dMVFFSSFwveUhcL1pPUT09IiwidmFsdWUiOiI3enZLbVJXQVFcL2RRYUl4UlorZU4zNmVwYWRmZktWZEhVbkdiNXN6Z0dPUTVUdXFLc1B6bmZTb0tzKzYrbit0OGNzWlk1cmFWbHoxYTBCM2tXbjM1QWpHcVZCaFR5TXVmNXNnNXR2cm1FRWlcL3lBMUFLNzgzMWlZYUFWY1VuVXJuN01mdUdydWJoSWMzeVFcL0Rad0VxTjJBSExIR3Nla3VWSFd6VmRKQ1wvRlBRPSIsIm1hYyI6IjM2YmM5OWU2MmE4NWNjODhjNTE3MzFkYzQyMWM2ZmI1YzlhZTliZGJhMjgwMDYzNGM0NWUzZDk0ODU0ZWFkOWYifQ%3D%3D; XSRF-TOKEN=eyJpdiI6Ik1oeFg1S01jQXdxN05BRmxrWmhpYnc9PSIsInZhbHVlIjoibUZDcUZ6cG80YWMyXC9pSytXdVI3YWdLK2dLTlN6OTk1WFwvVzdxNUFxTGl3bXpPQW0xNk5VNnR3Nlc2REN5bHVIIiwibWFjIjoiZDIzNTE4YzMzMjQzYmFiMmVmNmFhMDdkNWJhNWQwNTVmMjQyZWQ1ZDcxYWFmYTI2ZjM2MWE2MzVmOTJhMTY3ZiJ9; glidedsky_session=eyJpdiI6IjVFSzR3UGdGS2h5MDVyNDM4ZCtxR0E9PSIsInZhbHVlIjoiaVwvOEdyUVwvYU9rdUw0MDFjZzlrWXBhQ2pYREVUWEhkUTRTSTFEQnMrMGJEcXh4dHNGbWJld0J3RW9yd3Baa29DIiwibWFjIjoiZTkyODVjNGI1MWY3NjgwOTA4Yzc1MjRhOTg0NGM2Y2NlN2ZjNjViMTQ4ZTBmZjYxMDAwZWNiOWMyZWE5ZGM4YyJ9; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1603931070',
'Host': 'glidedsky.com',
'Referer': 'http://glidedsky.com/level/crawler-basic-2',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
}
# Running total across all pages; getNumber() adds to it via `global number`.
number = 0
# def run(url):
# res = requests.get(url, headers=header)
# html = etree.HTML(res.text)
# numbers = html.xpath("//div[@class='col-md-1']/text()")
# totle = 0
# for number in numbers:
# totle += int(number)
#
# return totle
def getNumber(response):
    """Add every number found on one result page to the global ``number``.

    Args:
        response: Object whose ``text`` attribute holds the page HTML (the
            main loop passes the result of ``requests.get``).

    Returns:
        The subtotal contributed by this page, as an ``int``. The same
        amount is added to the module-level ``number``.

    Raises:
        ValueError: If the text of a matched element is not an integer.
    """
    # The running total lives at module level.
    global number
    # Parse the page once.
    data = BeautifulSoup(response.text, "lxml")
    # Every element whose class is "col-md-1" holds one number.
    div_list = data.find_all(class_="col-md-1")
    # Read the text straight from each tag: serialising the tag and parsing it
    # into a brand-new document per element yields the same text at far
    # greater cost.
    page_total = sum(int(div.get_text().strip()) for div in div_list)
    number += page_total
    return page_total
# 开始
# if __name__ == "__main__":
# COUNT = 0
# for i in range(1, 1001):
# url = f'http://glidedsky.com/level/web/crawler-basic-2?page={i}'
# COUNT += run(url)
#
# print(COUNT)
if __name__ == '__main__':
    # Crawl pages 1..1000 and accumulate their numbers into `number`.
    # A single Session keeps the connection alive between requests instead
    # of opening a new one for each of the 1000 pages.
    with requests.Session() as session:
        for page in range(1, 1001):
            print(f"第{page}页")
            url = f"http://glidedsky.com/level/web/crawler-basic-2?page={page}"
            # The timeout prevents one stalled page from hanging the crawl.
            response = session.get(url=url, headers=header, timeout=10)
            # Stop on an error status rather than silently omitting a page
            # from the total.
            response.raise_for_status()
            getNumber(response)
    print(number)
密码是啥呢?3356211
幸运眷顾每一个看到最后的朋友,接下来我会更新十个爬虫练习,偏基础,大佬勿喷,cookie值是不同的,所以需自行修改哦!