python学习笔记10-下载图片或pdf到本地和调用迅雷下载

最新推荐文章于 2024-07-20 03:47:16 发布

虚幻时空

最新推荐文章于 2024-07-20 03:47:16 发布

阅读量975

点赞数

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/wzhang1987/article/details/106425777

版权

python 专栏收录该内容

15 篇文章 0 订阅

订阅专栏

之前简单代码里有爬标题的示例，这次看了些目录相关的内容，就想爬一下图片（类似漫画那种）保存到本地，以及调用迅雷的方法。
注意该示例没有筛选链接的步骤，因为我已经知道了目标链接的形式，所以仅仅是下载方面的代码。

后来又接到一个下载 pdf 的需求，而且目标网站需要先登录，这里也一并记录下来。

# -*- coding:UTF-8 -*-

import urllib.request
import requests,json,re,os,sys,time
from bs4 import BeautifulSoup
def page_url(chapter, page):
    """Return the image URL for page *page* of chapter *chapter*.

    The host and book id are redacted placeholders in this note; pages are
    numbered from 1 and served as ``<page>.jpg`` under the chapter id.
    """
    return 'https://xxx.com/bookimages/123/' + str(chapter) + '/' + str(page) + '.jpg'


if __name__ == '__main__':
    # Silence the urllib3 warnings that verify=False would otherwise print
    # on every request.
    requests.packages.urllib3.disable_warnings()
    # Local root folder; each chapter is saved into its own sub-folder.
    save_root = r'C:\Users\aaa\Desktop\test2'
    # Inclusive range of chapter ids to try.
    first_chapter = 217940
    last_chapter = 217950
    for chapter in range(first_chapter, last_chapter + 1):
        # A chapter is considered to exist when its first page can be fetched.
        probe_url = page_url(chapter, 1)
        status = requests.get(url=probe_url, verify=False).status_code
        print(probe_url)
        if status != 200:
            continue
        chapter_dir = os.path.join(save_root, str(chapter))
        # Download the chapter page by page until the first missing page.
        # The chapter and page counters are only advanced by the for loops,
        # so the probed chapter is the one downloaded and page N is saved
        # as N.jpg.
        for page in range(1, 100):
            image_url = page_url(chapter, page)
            response = requests.get(url=image_url, verify=False)
            if response.status_code != 200:
                break
            print(image_url)
            # The chapter folder must exist before writing; otherwise open()
            # fails with a "file not found" error that is really a missing
            # directory.
            os.makedirs(chapter_dir, exist_ok=True)
            with open(os.path.join(chapter_dir, str(page) + '.jpg'), 'wb') as f:
                f.write(response.content)
            # Pause between pages to avoid hammering the server.
            time.sleep(3)



# NOTE: every Thunder (Xunlei) method below requires two settings in the
# Thunder client: 1. do-not-disturb mode; 2. start tasks directly without a
# pop-up dialog.
#
# The two triple-quoted blocks that follow are bare string expressions, so
# nothing inside them is executed; they are kept as reference snippets only.
#
# Snippet 1 drives Thunder through COM via win32com.client. `url` and `name`
# are not defined inside the snippet and must be supplied before it can run.
# NOTE(review): the commit call is spelled `CommitTAsks` here -- confirm the
# exact method name against the ThunderAgent COM interface before enabling it.
'''
#迅雷下载方法一
#用win32com.client,由于我是py3.8，好像包括win32api，win32gui，win32con都不能用，所以试不了。
import win32com.client

thunder = win32com.client.Dispatch('ThunderAgent.Agent64.1')
thunder.AddTask(url,name)
thunder.CommitTAsks()
'''
# Snippet 2 launches Thunder.exe through os.system with the download URL
# appended to the command line.
# NOTE(review): the URL is interpolated into the command string unquoted; a
# URL containing spaces or shell metacharacters would need quoting (or
# subprocess with an argument list) if this snippet is ever enabled.
'''
#迅雷下载方法二
#用cmd的方式
import os
url="ftp://ygdy8:ygdy8@y201.dygod.org:1132/[阳光电影www.ygdy8.com].冬眠.BD.720p.中文字幕.rmvb"
os.system(r'"D:\gongju\xunlei\Program\Thunder.exe" {url}'.format(url=url))

'''
def decode_thunder_link(url):
    """Decode a ``thunder://`` link into the plain download URL it wraps.

    A Thunder link is ``thunder://`` followed by the Base64 encoding of
    ``'AA' + real_url + 'ZZ'``.

    The scheme and the ``AA``/``ZZ`` wrapper are removed as exact
    prefixes/suffixes. ``str.lstrip``/``str.strip`` treat their argument as a
    *set of characters*, so using them here would also eat legitimate leading
    or trailing ``A``/``Z`` characters of the real URL (for example a link
    ending in ``.7Z``).

    Args:
        url: the full ``thunder://...`` link.

    Returns:
        The decoded download URL as a ``str``.

    Raises:
        ValueError: if *url* does not start with ``thunder://``.
    """
    import base64  # local import keeps this helper self-contained

    scheme = 'thunder://'
    if not url.startswith(scheme):
        raise ValueError('not a thunder:// link: ' + repr(url))
    # Base64 payload -> bytes -> text.
    decoded = base64.b64decode(url[len(scheme):]).decode('utf-8')
    # Drop exactly one 'AA' prefix and one 'ZZ' suffix.
    if decoded.startswith('AA'):
        decoded = decoded[2:]
    if decoded.endswith('ZZ'):
        decoded = decoded[:-2]
    return decoded


# Example with the sample link from the original note:
# url = 'thunder://QUFodHRwOi8vZGwwMi55dXRvdS50djo5MjAvMTExMC9bMDHniYjlgJrlpKnlsaDpvpnorrBd56ysNDLpm4YvWzAx54mI5YCa5aSp5bGg6b6Z6K6wXeesrDQy6ZuGLm1wNFpa'
# print(decode_thunder_link(url))

#登录下载pdf
import requests,json,os,time,shutil


if __name__ == '__main__':

	#这个是切片筛选方法，比较简陋。。。，需要根据实际情况调整
    def content(html):
        """Extract the first two PDF hrefs from *html* by plain string slicing.

        Each link is the text between the opening marker of an
        ``<a class="information" ... href="`` tag and the ``">`` + CRLF that
        closes the attribute. The markers are matched literally (including
        the exact whitespace), so they have to be adjusted to the target page.

        Returns:
            A ``(first_link, second_link)`` tuple; a link that is not found
            comes back as an empty string.
        """
        anchor_open = '<a class="information"\r\n                       href="'
        href_close = '">\r\n'

        # Everything after the first anchor marker.
        _, _, after_first = html.partition(anchor_open)
        # First link, plus whatever follows its closing marker.
        first_link, _, remainder = after_first.partition(href_close)

        # Repeat the same two cuts on the remainder for the second link.
        after_second = remainder.partition(anchor_open)[2]
        second_link = after_second.partition(href_close)[0]

        return first_link, second_link  # relative links of the two PDFs

    base_url = 'http://www.xxx.com.cn/'
    save_dir = 'E:\\cxoxa\\'

    # A valid logged-in cookie is the key requirement: register/log in on the
    # target site, open a page that offers the download, and copy the cookie
    # string from the browser's developer tools (F12).
    cookie_str = r'xxx'
    # The cookie string and headers do not change between requests, so they
    # are parsed once instead of on every iteration.
    cookies = {}
    for pair in cookie_str.split(';'):
        # Browsers separate cookies with "; " -- strip the padding so keys do
        # not start with a space, and ignore empty pieces (trailing ';').
        pair = pair.strip()
        if not pair:
            continue
        key, value = pair.split('=', 1)
        cookies[key] = value
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}

    def download_pdf(link):
        """Fetch the PDF at the relative *link* and save it under save_dir.

        Does nothing when *link* is empty (no link was found on the page) and
        skips saving when the server does not answer with HTTP 200.
        """
        if not link:
            return
        # Full URL looks like http://www.xxx.com/123.pdf
        response = requests.get(url=base_url + link, stream=True, verify=True, headers=headers, cookies=cookies)
        if response.status_code != 200:
            print('skip ' + link + ' (HTTP ' + str(response.status_code) + ')')
            return
        # File name is the part after 'files/'; fall back to the last path
        # segment when the link has no such directory.
        filename = link.partition('files/')[2] or link.rpartition('/')[2]
        with open(save_dir + filename, 'wb') as f:
            f.write(response.content)
        print(filename)

    for i in range(5000):
        target = base_url + 'detail/id/' + str(i) + '/'
        req = requests.get(url=target, verify=True, headers=headers, cookies=cookies)
        # A 200 on the detail page means the login cookie was accepted.
        if req.status_code == 200:
            str0, str1 = content(str(req.content, encoding='UTF-8'))
            # Each link is passed to the same helper, so the second file is
            # fetched from its own URL rather than the first one's.
            download_pdf(str0)
            time.sleep(3)
            download_pdf(str1)
        # Pause between detail pages to avoid hammering the server.
        time.sleep(3)
        print(i)