#!/usr/bin/python
# -*- coding: utf-8 -*-
from html.parser import HTMLParser
import urllib.error
import urllib.request
import os, sys
import socket
from urllib import request
def out_log(logfile, message):
    """Append *message* as one line to *logfile* (file is created if absent)."""
    with open(logfile, 'a') as sink:
        sink.write(message + '\n')
class myparser(HTMLParser):
    """Collect the href targets of <a> tags into ``self.links``.

    The query strings Apache's autoindex adds for column sorting are
    skipped, since they are navigation chrome rather than real entries.
    """

    # Column-sort links emitted by Apache directory listings.
    _SORT_LINKS = ('?C=N&O=D', '?C=M&O=A', '?C=S&O=A', '?C=D&O=A', '?C=N&O=A')

    def __init__(self):
        super().__init__()
        self.links = []  # href values found while feeding HTML

    def handle_starttag(self, tag, attrs):
        """Record the href attribute of every non-sort <a> tag."""
        if tag != 'a':
            return
        for name, value in attrs:
            if name == 'href' and value not in self._SORT_LINKS:
                self.links.append(value)
def callbackfunc(blocknum, blocksize, totalsize):
    """Progress hook for ``urllib.request.urlretrieve``: print/log a crude bar.

    @blocknum:  number of blocks transferred so far
    @blocksize: size of one block in bytes
    @totalsize: size of the remote file in bytes (may be -1 if unknown)

    NOTE(review): reads the module-global ``file_name`` set by
    ``download_file`` — only meaningful while a download is in progress.
    """
    percent = 0
    try:
        # Cap at 100: the final block usually overshoots totalsize, which
        # previously made ``percent * '>'`` print more than 100 chevrons.
        percent = min(100, int(100.0 * blocknum * blocksize / totalsize))
    except ZeroDivisionError:
        pass  # server reported a zero size; leave percent at 0
    if totalsize > 505528:
        pass
    else:
        # Small files finish in essentially one block; show them as done.
        percent = 100
    sys.stdout.write('\r')
    out_log('G:\\HK\\simple\\out.log', '\r')
    out_log('G:\\HK\\simple\\out.log', file_name + percent * '>' + str(percent) + '%')
    sys.stdout.write(file_name + percent * '>' + str(percent) + '%')
    sys.stdout.flush()
def create_dir(root_tree, catalog):
    """Create *catalog* (a relative directory path) under *root_tree*,
    mirroring the remote directory structure locally.

    NOTE(review): uses ``os.chdir``, so it changes the process-wide
    working directory as a side effect.
    """
    os.chdir(root_tree)
    out_log('G:\\HK\\simple\\out.log', '创建目录:' + catalog)
    # exist_ok=True replaces the old try/except FileExistsError dance.
    os.makedirs(catalog, exist_ok=True)
def download_file(url, down_path):
    """Download *url* to *down_path*, recording failures in ``error_download``.

    The download is skipped when *down_path* already exists.  Network
    failures are collected (url -> down_path) instead of raised, so the
    module-level retry loop can attempt them again later.
    """
    global file_name
    global error_download
    file_name = url.split('/')[-1]  # displayed by callbackfunc's progress bar
    # Accumulate failures across calls instead of wiping the dict every
    # time — the old ``error_download = {}`` meant the retry loop could
    # only ever see the failure from the very last download.
    if 'error_download' not in globals():
        error_download = {}
    socket.setdefaulttimeout(600)
    try:
        if os.path.exists(down_path):
            out_log('G:\\HK\\simple\\out.log', os.path.join(down_path) + 'have existed')
            print(os.path.join(down_path), ' have existed')
        else:
            print('download:')
            # Was logging to G:\HK\packages\out.log — every other call in
            # this file logs to G:\HK\simple\out.log; made consistent.
            out_log('G:\\HK\\simple\\out.log', 'download:')
            request.urlretrieve(url, down_path, callbackfunc)
    except socket.gaierror:
        error_download[url] = down_path
        out_log('G:\\HK\\simple\\out.log', 'socket.gaierror:' + url)
        print('socket.gaierror', url)
    except urllib.error.URLError:
        error_download[url] = down_path
        out_log('G:\\HK\\simple\\out.log', 'urllib.error.URLError' + url)
        print('urllib.error.URLError', url)
    sys.stdout.write('\n')
def get_url_tree(url_tree):
    """Walk one level of remote directory listings.

    *url_tree* maps a listing URL to its local mirror directory.  Each
    listing is fetched and parsed: subdirectory links are created locally
    and queued in the returned dict, plain file links are downloaded
    immediately.

    Returns ``(next_level_dict, level)``; ``level == 0`` signals that no
    further subdirectories were found and the walk is complete.
    """
    url_tree_dict = {}
    level = 0
    for url in url_tree:
        try:
            response = request.urlopen(url)
            page = response.read().decode('utf-8')
            hp = myparser()
            hp.feed(page)
            hp.close()
        except urllib.error.URLError as e:
            print(e)
            # Without this, a failed fetch reprocessed the previous
            # iteration's links (or hit a NameError on the first URL).
            continue
        try:
            hp.links.remove("../")  # drop the parent-directory link
        except ValueError:
            pass
        for file in hp.links:
            if '/' in file:
                # Trailing slash marks a subdirectory: mirror it locally
                # and queue it for the next round.
                create_dir(url_tree[url], file)
                url_tree_dict[url + file] = url_tree[url] + file
            else:
                download_file(url + file, url_tree[url] + file)
            if file.find('/') > 0:
                level += 1
    return url_tree_dict, level
# Root listing URL mapped to its local mirror directory.
url_tree = {"http://archive.kylinos.cn/kylin/KYLIN-ALL/": 'G:\\HK\\ios_FT\\'}
# url_tree = {"https://mirrors.aliyun.com/pypi/packages/": 'G:\\HK\\packages\\'}
# exist_ok=True so a pre-existing first directory no longer prevents the
# second from being created (the old single try/except skipped the second
# makedirs as soon as the first raised FileExistsError).
os.makedirs('G:\\HK\\ios_FT\\', exist_ok=True)
os.makedirs('G:\\HK\\packages\\', exist_ok=True)
while True:
    # Descend one directory level per pass until no subdirectories remain.
    url_tree, level = get_url_tree(url_tree)
    if level == 0:
        break
    print(url_tree, level)
# Retry everything that failed.  error_download only exists once
# download_file has run at least once, hence the guarded lookup.
for key, path in globals().get('error_download', {}).items():
    download_file(key, path)