from urllib import request
import re
import os
def down_html(url, fname):
    """Download the resource at *url* and save it to the file *fname*.

    The body is copied in fixed-size chunks so a large download is never
    held in memory all at once, and the response is closed as soon as the
    copy finishes or fails.

    Args:
        url: Address to fetch; passed unchanged to ``request.urlopen``.
        fname: Path of the local file to create or overwrite. It is opened
            in binary mode, so the bytes are stored exactly as received.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        OSError: If *fname* cannot be opened or written.
    """
    chunk_size = 64 * 1024
    # The URL is opened first so that a failed request does not leave an
    # empty output file behind; both resources are released by the ``with``.
    with request.urlopen(url) as resp, open(fname, 'wb') as fobj:
        while True:
            data = resp.read(chunk_size)
            if not data:
                # An empty read means the end of the body was reached.
                break
            fobj.write(data)
def get_url(fname, patt):
    """Collect every match of the regular expression *patt* in *fname*.

    The file is read line by line in text mode (platform default
    encoding), so a match can never span a line break.

    Args:
        fname: Path of the text file to scan.
        patt: Regular expression, as a pattern string or compiled pattern.

    Returns:
        A list with the full text of each match, in order of appearance
        (top to bottom, left to right within a line). Empty if nothing
        matches.
    """
    cpatt = re.compile(patt)
    matches = []
    with open(fname) as fobj:
        for line in fobj:
            # finditer + group() yields the whole match for every hit on
            # the line; findall would return only the capture group when
            # the pattern contains one.
            matches.extend(m.group() for m in cpatt.finditer(line))
    return matches
if __name__ == '__main__':
    # Script entry point: save the page, extract the image URLs found in
    # it, then download each image into save_dir.
    save_dir = '/tmp/images'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(save_dir, exist_ok=True)
    # Raw string: '\w' and '\.' are regex escapes, not string escapes.
    patt = r'http://[.\w/-]+\.(jpg|jpeg|png|gif)'
    html_url = 'http://www.tedu.cn/'
    html_fname = '/tmp/tedu.html'
    down_html(html_url, html_fname)
    urls = get_url(html_fname, patt)
    # dict.fromkeys drops repeated URLs while keeping first-seen order, so
    # an image referenced several times is fetched only once.
    for url in dict.fromkeys(urls):
        # The last path segment of the URL becomes the local file name;
        # images sharing a base name overwrite one another.
        url_fname = url.split('/')[-1]
        image_fname = os.path.join(save_dir, url_fname)
        down_html(url, image_fname)