[Python]_[批量下载网站文件]

最新推荐文章于 2024-07-25 08:08:29 发布

Peter(阿斯拉达)

最新推荐文章于 2024-07-25 08:08:29 发布

阅读量6.8k

点赞数

分类专栏：语言特性 Python-完全自动化文章标签： python 下载 pdf image

本文链接：https://blog.csdn.net/infoworld/article/details/9337619

版权

Python-完全自动化同时被 2 个专栏收录

19 篇文章 0 订阅

订阅专栏

语言特性

16 篇文章 0 订阅

订阅专栏

场景:

1.有时候需要下载某个网站上提供的所有pdf文件，貌似没发现有下载工具提供这个功能。

#! encoding=utf-8

import urllib2
import re
import os

def Download(url,output):
    """Fetch ``url`` and write the response body to the file ``output``.

    url    -- address of the resource to fetch (passed to urllib2.urlopen).
    output -- path of the local file; it is opened in binary write mode,
              so an existing file is overwritten.

    Errors from opening the URL or the file are not caught here; they
    propagate to the caller.
    """
    print("downloading..."+url)
    response = urllib2.urlopen(url)
    try:
        with open(output,"wb") as resourceFile:
            # Copy in fixed-size chunks so a large file is never held in
            # memory as a whole.
            while True:
                chunk = response.read(64 * 1024)
                if not chunk:
                    break
                resourceFile.write(chunk)
    finally:
        # Release the connection even when reading or writing fails.
        response.close()
    print("downloaded")

def Action(url,ext = "pdf",output = "."):
    """Download every file with extension ``ext`` linked from the page at ``url``.

    The page is fetched and scanned for double-quoted strings that end in
    ``.<ext>``. Each hit is resolved against ``url`` and saved below
    ``output``: page-relative links keep their directory structure, while
    absolute links are stored flat under their file name.

    url    -- address of the page to scan.
    ext    -- file extension to look for, with or without the leading dot.
    output -- directory that receives the downloaded files.

    Errors raised while fetching the page or a file, or while creating a
    directory, are not caught here and abort the run.
    """
    # Function-scope import so this block does not rely on the file's
    # import list; urlparse is a Python 2 module, like urllib2 used below.
    from urlparse import urljoin

    #1.domain
    # Printed for diagnostics only; links are resolved with urljoin below.
    domain = url[0:url.rfind("/")+1]
    print(domain)

    #2.content
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        content = response.read()
    finally:
        response.close()

    #3.resource
    # Escape the extension and require a literal dot in front of it, so
    # that "jpg" no longer matches arbitrary quoted text ending in "jpg"
    # and regex metacharacters in ext are taken literally.
    pattern = re.compile(r'"([^"]+\.' + re.escape(ext.lstrip(".")) + r')"')
    links = pattern.findall(content)
    print("file num: "+str(len(links)))

    #4.download
    for one in links:
        # urljoin handles absolute, scheme-relative, root-relative and
        # page-relative links alike.
        fileUrl = urljoin(url, one)
        if one.startswith(("http://", "https://", "//")):
            # Absolute link: keep only the file name.
            localPath = one[one.rfind("/")+1:]
        else:
            # Drop empty, "." and ".." segments so a link cannot place a
            # file outside the output directory.
            segments = [part for part in one.split("/")
                        if part not in ("", ".", "..")]
            localPath = "/".join(segments)
        fileOutput = os.path.join(output, localPath)

        targetDir = os.path.dirname(fileOutput)
        if targetDir:
            try:
                os.makedirs(targetDir)
            except OSError:
                # An already existing directory is fine; anything else
                # (e.g. permission denied) is a real error.
                if not os.path.isdir(targetDir):
                    raise

        print(fileUrl)
        print(fileOutput)
        Download(fileUrl,fileOutput)

# Script entry point: runs one batch download when the file is executed directly.
if __name__ == '__main__':
    print("download")
    # NOTE: this address is assigned but not passed to the Action call below.
    url = "http://compgeom.cs.uiuc.edu/~jeffe/teaching/algorithms/"
    # Fetch every linked "jpg" file from the page into the default output directory.
    Action("http://tech.qq.com/", ext="jpg")