"""
多线程执行爬虫,避免某个网络资源卡住其他资源下载;
Python线程相关知识点:
import threading 引入线程模块
t = threading.Thread(target=func, args=()) 定义一个线程
t.start() 线程开始
t.setDaemon(False) 默认值;线程以非守护(前台)模式运行,主程序会等待它结束;
t.setDaemon(True) 线程以守护(后台)模式运行,主程序退出时该线程被强制结束;
t.join() (当前程序)等待线程t执行完毕;
lock = threading.RLock() 创建可重入线程锁对象
lock.acquire() 获取线程锁,如果被占用则等待
lock.release() 释放锁
"""
import os
import threading
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
def imageSpider(start_url):
    """Fetch *start_url*, collect every ``<img>`` tag, and spawn one
    download thread per unique image URL.

    Side effects: appends each started thread to the module-level
    ``threads`` list and increments the module-level ``count`` used to
    number the saved files.  Errors are printed, never raised.
    """
    global threads
    global count
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        # Context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(req) as resp:
            data = resp.read()
        # Detect the page encoding (try utf-8, then gbk) before parsing.
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "html.parser")
        images = soup.select("img")
        print(images)
        for image in images:
            try:
                src = image["src"]
                # urljoin lives in urllib.parse; resolve relative src
                # attributes against the page URL.
                url = urllib.parse.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    count = count + 1
                    T = threading.Thread(target=download, args=(url, count))
                    # Non-daemon: interpreter waits for downloads to finish.
                    # (setDaemon() is deprecated since Python 3.10.)
                    T.daemon = False
                    T.start()
                    threads.append(T)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)
def download(url, count):
    """Download *url* and save it as ``images/<count><ext>``.

    The extension is taken from the last 4 characters of the URL when
    they look like ``.xxx``; otherwise ``.jpg`` is assumed.  Errors are
    printed, never raised.
    """
    try:
        # Guard len(url) >= 4: on shorter URLs the original negative
        # index silently read the wrong character.
        if len(url) >= 4 and url[-4] == ".":
            ext = url[-4:]
        else:
            ext = ".jpg"
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=100) as resp:
            data = resp.read()
        # Create the target directory on first use; os.path.join keeps
        # the path portable instead of the hard-coded "images\\".
        os.makedirs("images", exist_ok=True)
        with open(os.path.join("images", str(count) + ext), "wb") as fobj:
            fobj.write(data)
        print("downloaded" + str(count) + ext)
    except Exception as err:
        print(err)
# --- script entry point ---
# Browser-like User-Agent so the server does not reject the crawler.
headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
start_url = "https://www..net/"
threads = []
count = 0
imageSpider(start_url)
# Block until every spawned download thread has finished.
for worker in threads:
    worker.join()
print("the End")