# 自动下载Dive into Python 3网页及其相关链接
# -*- coding=utf-8 -*-
import os
import urllib
import re
#1. 下载种子;从指定种子网页开始自动下载,递归下载有效链接
source_link = 'http://woodpecker.org.cn/diveintopython/toc/index.html'
f = open(source_link.split("/")[-1], 'w')
print 'save to file : ', os.getcwd()
page = urllib.urlopen(source_link)
page_content = page.read()
#找到有效链接
# . 匹配任意除换行符外的字符
# * 匹配前一个字符0次到无限次
# + 匹配前一个字符1次到无限次
# [] 对应位置可以是字符集中的任意字符
# [^] 对应位置不是字符集中的任意字符
m = re.findall(r'<a href="([^"#]+)".*>.*</a>', page_content)
mm = []
for mi in m:
if (mi not in mm) and (mi.endswith('html')):
mm.append(mi)
f.write(page_content)
f.close()
#2. 下载有效链接页面
print "%d pages to be downloaded "%(len(mm))
for mmi in mm:
#os.path.dirname("a/b/c.txt") ==> a/b
#os.path.basename("a/b/c.txt") ==> c.txt
sub_link = os.path.dirname(source_link) + "/" + mmi
f = open(sub_link.split("/")[-1], 'w')
page = urllib.urlopen(sub_link)
page_content = page.read()
f.write(page_content)
f.close()
print '.',