# 如果需要专门出一篇文章,那就请多多评论回复
# 你的点赞是对我的无尽支持
# 程序含金量虽然不高,但是也有点,大佬勿喷!
# 程序中的保存文件及输入链接未经处理,可以根据自己需求对代码进行修改(比如加个input什么的),还请认真看注释!!!
# 此爬虫已过时,但仍可参考,最新版本文章看下方
# 最新爬虫版本见作者主页
#作者CSDN:https://blog.csdn.net/qq_45429426?spm=1011.2124.3001.5343 漫游感知
#请勿转载!!!!
#没有的库请用pip指令下载
import requests #第三方库,来发送请求等操作
import jsonpath #第三方库,来解析json数据
from lxml import etree #第三方库,来使用xpath解析网页
import re
def lancode(url):
    """Load a Lanzou share page and extract the download-iframe path.

    :param url: share-page URL (anything ``str()``-able)
    :return: two-item list -- [iframe src with '/fn?' stripped, raw iframe src].
        The stripped copy later feeds the request form data, the raw copy
        builds the follow-up page URL.
    """
    request_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'lzt666.lanzous.com',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    }
    share_html = requests.get(url=str(url), headers=request_headers).text
    document = etree.HTML(share_html)
    # The download widget lives in an iframe; grab its src attribute.
    iframe_src = document.xpath('/html/body/div[3]/div[2]/div[4]/iframe/@src')[0]
    # NOTE(review): the original comment said '/fu?' but the code strips
    # '/fn?' -- presumably '/fn?' is correct; confirm against a live page.
    stripped_src = str(iframe_src).replace('/fn?', '')
    return [stripped_src, iframe_src]
def get_download_url(url_list):
    """Resolve a Lanzou share into its direct file-download URL.

    Fetches the iframe page, scrapes the JS ``sign`` token out of the page
    source with a regex, POSTs it to ``ajaxm.php`` and assembles the final
    URL from the JSON response.

    :param url_list: pair returned by ``lancode``: [iframe path with the
        query prefix stripped, raw iframe ``src`` path]
    :return: direct download URL as ``str``
    """
    # Follow-up page whose inline JS contains the 'sign' token.
    page_url = 'https://lzt666.lanzous.com' + url_list[1]
    page_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'codelen=1; pc_ad1=1',
        'Host': 'lzt666.lanzous.com',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    }
    # NOTE(review): a body on a GET request is unusual; presumably the
    # server ignores it -- confirm before removing.
    data = {
        '': url_list[0]
    }
    page_code_sign = requests.get(url=page_url, headers=page_headers, data=data).text
    # print(page_code_sign)
    # The page evidently embeds at least two 'sign' occurrences and the
    # second (index 1) is used -- an IndexError here means the remote page
    # layout changed.
    txt = re.findall("'sign':(.*?),'ves'", page_code_sign)
    txt_re = str(txt[1]).replace("'", '')  # strip quotes -> correct token to submit
    url_post = 'https://lzt666.lanzous.com/ajaxm.php'
    headers = {
        'Accept': 'application/json, text/javascript, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        # NOTE(review): hard-coded Content-Length overrides the value
        # requests would compute -- verify it matches the actual body size.
        'Content-Length': '136',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'codelen=1; pc_ad1=1',
        'Host': 'lzt666.lanzous.com',
        'Origin': 'https://lzt666.lanzous.com',
        'Referer': 'https://lzt666.lanzous.com' + url_list[1],
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    # Form payload mimicking the site's own AJAX download request.
    data = {
        'action': 'downprocess',
        'signs': '?ctdf',
        'sign': txt_re,
        'ves': '1',
        'websign': None
    }
    post_url_download = requests.post(url=url_post, headers=headers, data=data).json()
    # print(post_url_download)
    # The JSON response carries the file token under a (possibly nested)
    # 'url' key; jsonpath finds it wherever it sits.
    url_file = jsonpath.jsonpath(post_url_download, '$..url')[0]
    # print('Referer: https://lzt666.lanzous.com' + url_list[1],'sign:'+url_list[0])
    download_file_url = 'https://vip.d0.baidupan.com/file/' + url_file
    return str(download_file_url)  # direct file-download link
def download(url, save_path='./1.zip'):
    """Download the resolved file URL and write it to disk.

    :param url: direct download link (as returned by ``get_download_url``)
    :param save_path: destination file path; defaults to './1.zip' so the
        original script behaviour is unchanged (generalized from the
        previously hard-coded path)
    :return: True once the file has been written
    """
    url = str(url)
    # NOTE(review): a body on a GET request is unusual; presumably the
    # server ignores it -- confirm before removing.
    data = {
        '': url.rsplit('/')[-1].replace('?', '').replace('=', ':')
    }
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'down_ip=1',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    }
    # Stream in chunks instead of buffering the whole file in memory
    # (.content); the 'with' blocks close both the response and the file,
    # so no explicit close() calls are needed.
    with requests.get(url=url, headers=headers, data=data, stream=True) as response:
        with open(save_path, 'wb') as file_down:
            for chunk in response.iter_content(chunk_size=65536):
                file_down.write(chunk)
    print('完成下载')
    return True
if __name__ == "__main__":
    # Guard the demo run so importing this module no longer triggers
    # network requests as a side effect.
    # Replace the placeholder with a real Lanzou share link before running.
    a = lancode('https://lzt666.lanzous.com/xxxx')
    b = get_download_url(a)
    download(b)