Run the crawler on multiple threads so that one slow or stalled network resource does not block the download of the others.
Relevant Python threading points (a short sketch combining these calls follows the list):
- import threading imports the threading module
- t = threading.Thread(target=func, args=()) creates a thread object that will run func(*args)
- t.start() starts the thread
- t.setDaemon(False) (the default) keeps t a foreground (non-daemon) thread: the interpreter waits for it to finish before exiting
- t.setDaemon(True) makes t a daemon (background) thread that is killed when the main program exits; assigning t.daemon = True is preferred, since setDaemon() is deprecated as of Python 3.10
- t.join() blocks the calling thread (here, the main program) until t finishes
- lock = threading.RLock() creates a reentrant lock object
- lock.acquire() acquires the lock, blocking while another thread holds it
- lock.release() releases the lock
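
A minimal sketch combining these primitives (the worker function, the shared total, and the thread count are illustrative choices, not part of the crawler below):

```python
import threading

lock = threading.RLock()      # reentrant lock guarding the shared counter
total = 0

def worker(n):
    global total
    with lock:                # same effect as lock.acquire() / lock.release()
        total += n

threads = []
for i in range(5):
    t = threading.Thread(target=worker, args=(i,))
    t.daemon = False          # foreground thread; False is already the default
    t.start()
    threads.append(t)

for t in threads:
    t.join()                  # the main program waits for each worker to finish

print(total)                  # always 10: the lock serializes the updates
```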
```python
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import threading
import os


def imageSpider(start_url):
    """Collect every <img> URL on start_url and download each one in its own thread."""
    global threads
    global count
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req).read()
        # Let UnicodeDammit guess the page encoding (utf-8 or gbk) and decode it
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "html.parser")
        images = soup.select("img")
        print(images)
        for image in images:
            try:
                src = image["src"]
                # Resolve relative src attributes against the page URL
                url = urllib.request.urljoin(start_url, src)
                if url not in urls:  # skip duplicate images
                    urls.append(url)
                    print(url)
                    count = count + 1
                    T = threading.Thread(target=download, args=(url, count))
                    T.daemon = False  # foreground thread (setDaemon() is deprecated)
                    T.start()
                    threads.append(T)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)


def download(url, count):
    """Fetch one image and save it as images/<count><ext>."""
    try:
        # Keep the original extension if the URL ends in ".xxx", else default to .jpg
        if url[len(url) - 4] == ".":
            ext = url[len(url) - 4:]
        else:
            ext = ".jpg"
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100).read()
        os.makedirs("images", exist_ok=True)  # the original crashes if images/ is missing
        with open(os.path.join("images", str(count) + ext), "wb") as fobj:
            fobj.write(data)
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)


start_url = "https://www.csdn.net/"
headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; "
                         "rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
count = 0
threads = []
imageSpider(start_url)
# Wait for every download thread before announcing completion
for t in threads:
    t.join()
print("the End")
```
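
As a design note, spawning one Thread per image scales poorly on pages with many images. A hedged alternative sketch using the standard library's concurrent.futures thread pool (a different technique from the manual Thread/join bookkeeping above; download is the function defined in the spider, and image_urls is a hypothetical stand-in for the deduplicated URL list):

```python
from concurrent.futures import ThreadPoolExecutor

def download_all(image_urls, max_workers=8):
    # image_urls is a hypothetical stand-in for the deduplicated list
    # built in imageSpider; download() is the function defined above
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for i, url in enumerate(image_urls, start=1):
            pool.submit(download, url, i)
    # leaving the with-block waits for every submitted task,
    # which replaces the explicit t.join() loop
```

The bounded pool caps concurrent connections, which is gentler on both the target site and the local machine than one thread per URL.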