# 自动下载Dive into Python 3网页及其相关链接
# -*- coding=utf-8 -*-
import os
import urllib
import re
#1. 下载种子;从指定种子网页开始自动下载,递归下载有效链接
source_link = 'http://woodpecker.org.cn/diveintopython/toc/index.html'
f = open(source_link.split("/")[-1], 'w')
print 'save to file : ', os.getcwd()
page = urllib.urlopen(source_link)
page_content = page.read()
#找到有效链接
# . 匹配任意除换行符外的字符
# * 匹配前一个字符0次到无限次
# + 匹配前一个字符1次到无限次
# [] 对应位置可以是字符集中的任意字符
# [^] 对应位置不是字符集中的任意字符
m = re.findall(r'<a href="([^"#]+)".*>.*</a>', page_content)
mm = []
for mi in m:
if (mi not in mm) and (mi.endswith('html')):
mm.append(mi)
f.write(page_content)
f.close()
#2. 下载有效链接页面
print "%d pages to be downloaded "%(len(mm))
for mmi in mm:
#os.path.dirname("a/b/c.txt") ==> a/b
#os.path.basename("a/b/c.txt") ==> c.txt
sub_link = os.path.dirname(source_link) + "/" + mmi
f = open(sub_link.split("/")[-1], 'w')
page = urllib.urlopen(sub_link)
page_content = page.read()
f.write(page_content)
f.close()
print '.',