# Reworked the earlier crawler to download with multiple threads (a quick, rough modification).
import threading
import time
import queue
import urllib.request
import re
import os
# Open a web page
def openURL(web, timeout=30):
    """Fetch a URL and return its body decoded as UTF-8 text.

    Args:
        web: URL of the page to fetch.
        timeout: Seconds to wait on blocking network operations, so that a
            stalled server cannot hang the calling thread indefinitely.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Identify as a desktop browser via the User-Agent header.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE'}
    req = urllib.request.Request(url=web, headers=headers)
    # The context manager closes the response even if read/decode raises.
    with urllib.request.urlopen(req, timeout=timeout) as data:
        return data.read().decode('UTF-8')
# Extract sub-page links and sub-page titles
def filterHTML(html):
    """Collect the sub-page links and titles found in *html*.

    Every ``class="il_img"><a href="..." title="..."`` occurrence yields one
    entry; the ``href`` value is prefixed with the site root.

    Args:
        html: Page source to scan.

    Returns:
        A list of ``(absolute_link, title)`` tuples in document order; empty
        when nothing matches.
    """
    site_root = 'http://www.ivsky.com'
    anchor_pattern = re.compile(r'class="il_img"><a href="(.*?)" title="(.*?)"')
    return [
        (site_root + found.group(1), found.group(2))
        for found in anchor_pattern.finditer(html)
    ]
# Extract the image URLs from a sub-page
def searchHTML(html):
    """Build the (name, url) list of images described by a picture page.

    The page must contain, on a single line (the pattern does not match
    across newlines), an ``arctitle=`` value with ``(<count>张`` in it,
    followed by ``imgURL='<path>'`` and ``aid='<id>'``.  The first image URL
    is ``imgURL`` itself; further URLs are derived by inserting ``-001``,
    ``-002``, ... in front of the file extension, and further names by adding
    the same index to ``aid``.

    Args:
        html: Source of the picture page.

    Returns:
        A list of ``(name, url)`` string tuples, first image first.  An empty
        list when the page does not carry the expected metadata.

    Raises:
        ValueError: If the image count (or, when more than one URL is
            generated, the ``aid`` value) is not an integer.
    """
    base_link = 'http://img.ivsky.com'
    msg_re = re.compile(r'arctitle=.*\((.*?)张.*?imgURL=\'(.*?)\'.*?aid=\'(.*?)\'')
    found = msg_re.search(html)
    if found is None:
        # No metadata on this page: nothing to download.
        return []
    num, url, name = found.groups()
    num = int(num)
    link = base_link + url
    result = [(name, link)]
    # Split on the extension of the last path component only, so a URL
    # without an extension is not cut at a dot inside the host name.
    stem, ext = os.path.splitext(link)
    # Only a trailing "-<digits>" counts as an existing sequence number;
    # hyphens elsewhere in the URL (e.g. in a directory name) are kept.
    numbered = re.search(r'-\d+$', stem)
    if numbered is None:
        begin = 1
    else:
        begin = 2
        stem = stem[:numbered.start()]
    # NOTE(review): when the first URL is already numbered, this yields
    # num - 1 URLs in total; confirm against the site whether the final
    # index should be num.
    for i in range(begin, num):
        result.append((str(int(name) + i), '%s-%03d%s' % (stem, i, ext)))
    return result
# Each thread calls down_pic to download the images found at the given link
def down_pic(url, path):
    """Download the images reachable from one sub-page into *path*.

    Opens *url*, follows the first ``il_img`` link found there, and saves
    every image listed by ``searchHTML`` for that page as ``<name>.jpg``
    inside *path*.  A failed image is reported and skipped so the remaining
    ones are still attempted.

    Args:
        url: Address of the sub-page to start from.
        path: Existing directory that receives the image files.

    Raises:
        IndexError: If the sub-page contains no ``il_img`` link.
    """
    sub_html = openURL(url)
    link = filterHTML(sub_html)[0][0]
    html = openURL(link)
    for pic_name, pic_url in searchHTML(html):
        target = os.path.join(path, pic_name + '.jpg')
        print('%s:url=%s' % (threading.current_thread().name, pic_url))
        try:
            urllib.request.urlretrieve(pic_url, target)
        except Exception as exc:
            # Best effort per image: report what failed and why, then go on.
            # Unlike a bare except, this lets SystemExit/KeyboardInterrupt
            # propagate.
            print('下载失败: %s (%r)' % (pic_url, exc))
    print('%s 下载完成' % (os.path.split(path)[-1]))
def _safe_dirname(title):
    """Return *title* made safe for use as a single directory name."""
    # Replace path separators, characters Windows forbids and control
    # characters; strip dots/spaces so ".." cannot climb out of the folder.
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title).strip(' .')
    return cleaned or '_'


def main(max_workers=8):
    """Download every entry listed on the index page into ``PicDownload``.

    One sub-directory is created per entry (named after its title) and the
    entries are downloaded concurrently by a fixed pool of worker threads
    that drain a shared queue.  The function returns once all workers have
    finished.

    Args:
        max_workers: Upper bound on the number of concurrent download
            threads; values below 1 are treated as 1.
    """
    base_path = 'PicDownload'
    os.makedirs(base_path, exist_ok=True)
    web = 'http://www.ivsky.com/tupian/index_5.html'
    q = queue.Queue()
    html = openURL(web)
    for link, title in filterHTML(html):
        # Create the target folder for this entry.
        path = os.path.join(base_path, _safe_dirname(title))
        os.makedirs(path, exist_ok=True)
        q.put((link, path))

    def worker():
        # Pull jobs until the queue is drained; the queue is fully populated
        # before any worker starts, so Empty means there is no work left.
        while True:
            try:
                link, path = q.get_nowait()
            except queue.Empty:
                return
            try:
                down_pic(link, path)
            except Exception as exc:
                # Keep the worker alive so the remaining entries still run.
                print('%s 下载失败: %r' % (os.path.split(path)[-1], exc))

    worker_count = min(max(1, max_workers), q.qsize())
    workers = [threading.Thread(target=worker) for _ in range(worker_count)]
    for thread in workers:
        thread.start()
    for thread in workers:
        thread.join()
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()