多线程爬虫
所谓多线程，即让程序中的某些程序段并发执行；合理地设置多线程，可使爬虫效率更高。
例:普通的糗事百科段子爬虫
import urllib.request
import re
import urllib.error
# Browser-like User-Agent header as a (name, value) pair.
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/63.0.3239.132 Safari/537.36",
)

# Build an opener that sends the header above and install it globally, so
# every later urllib.request.urlopen() call goes through it.
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
# Single-threaded crawl: fetch pages 1 and 2 one after another and print
# every joke found on each page.
pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
# Compile once; the pattern is identical for every page. re.S lets "." match
# newlines because the joke markup spans several lines.
joke_re = re.compile(pat, re.S)
for i in range(1, 3):
    url = "https://www.qiushibaike.com/text/page/" + str(i)
    # The with-block closes the HTTP response as soon as the body is read.
    with urllib.request.urlopen(url) as response:
        pagedata = response.read().decode("utf-8", "ignore")
    datalist = joke_re.findall(pagedata)
    for j, joke in enumerate(datalist):
        print("第" + str(i) + "页第" + str(j) + "个段子内容是:")
        print(joke)
建立线程的基本格式:
import threading
class A(threading.Thread):
    """Demo thread that prints its label ten times."""

    def __init__(self):
        # No extra state; just run the standard Thread initialisation.
        super().__init__()

    def run(self):
        """Thread body; executed in the new thread once start() is called."""
        for _ in range(10):
            print("我是线程A")
class B(threading.Thread):
    """Demo thread that prints its label ten times."""

    def __init__(self):
        # No extra state; just run the standard Thread initialisation.
        super().__init__()

    def run(self):
        """Thread body; executed in the new thread once start() is called."""
        for _ in range(10):
            print("我是线程B")
# Create and start both demo threads. start() returns without waiting for
# run() to finish, so the output of A and B may interleave.
t1=A()
t1.start()
t2=B()
t2.start()
例:将糗事百科爬虫改为多线程爬虫
import urllib.request
import threading
import re
import urllib.error
# Install a global opener that identifies itself as a desktop browser; all
# urllib.request.urlopen() calls made by the crawler threads will use it.
opener = urllib.request.build_opener()
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
)
opener.addheaders = [headers]
urllib.request.install_opener(opener)
class One(threading.Thread):
    """Crawler thread for the odd-numbered pages (1, 3, 5, 7, 9)."""

    # Compiled once for the class. re.S lets "." match newlines because the
    # joke markup spans several lines.
    _JOKE_RE = re.compile(
        '<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S
    )

    def __init__(self):
        super().__init__()

    def _fetch(self, url):
        """Return the decoded body of *url*, or None if the request failed.

        Failures are printed rather than raised, so one bad page does not
        terminate the whole thread.
        """
        try:
            # The with-block closes the response once the body is read.
            with urllib.request.urlopen(url) as response:
                return response.read().decode("utf-8", "ignore")
        except urllib.error.URLError as e:
            # HTTPError carries both "code" and "reason"; a plain URLError
            # only has "reason".
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        except Exception as e:
            print(e)
        return None

    def run(self):
        """Fetch each odd page in turn and print every joke found on it."""
        for page in range(1, 10, 2):
            url = "https://www.qiushibaike.com/text/page/" + str(page)
            pagedata = self._fetch(url)
            if pagedata is None:
                continue  # already reported; move on to the next page
            for index, joke in enumerate(self._JOKE_RE.findall(pagedata)):
                try:
                    print("第" + str(page) + "页第" + str(index) + "个段子内容是:")
                    print(joke)
                except Exception as e:
                    # Best effort, kept from the original: printing can fail
                    # (e.g. text the console encoding cannot represent);
                    # report it and continue with the next joke.
                    print(e)
class Two(threading.Thread):
    """Crawler thread for the even-numbered pages (2, 4, 6, 8, 10)."""

    # Compiled once for the class. re.S lets "." match newlines because the
    # joke markup spans several lines.
    _JOKE_RE = re.compile(
        '<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S
    )

    def __init__(self):
        super().__init__()

    def _fetch(self, url):
        """Return the decoded body of *url*, or None if the request failed.

        Failures are printed rather than raised, so one bad page does not
        terminate the whole thread.
        """
        try:
            # The with-block closes the response once the body is read.
            with urllib.request.urlopen(url) as response:
                return response.read().decode("utf-8", "ignore")
        except urllib.error.URLError as e:
            # HTTPError carries both "code" and "reason"; a plain URLError
            # only has "reason".
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        except Exception as e:
            print(e)
        return None

    def run(self):
        """Fetch each even page in turn and print every joke found on it."""
        for page in range(2, 11, 2):
            url = "https://www.qiushibaike.com/text/page/" + str(page)
            pagedata = self._fetch(url)
            if pagedata is None:
                continue  # already reported; move on to the next page
            for index, joke in enumerate(self._JOKE_RE.findall(pagedata)):
                try:
                    print("第" + str(page) + "页第" + str(index) + "个段子内容是:")
                    print(joke)
                except Exception as e:
                    # Best effort: printing can fail (e.g. text the console
                    # encoding cannot represent); report it and continue
                    # with the next joke.
                    print(e)
# Start both crawler threads. One takes the odd pages and Two the even
# pages; start() returns without waiting, so the two downloads overlap.
a=One()
a.start()
b=Two()
b.start()
使用多线程后，两个线程的抓取任务可以交叉执行，因此总运行时间通常比单线程更短，速度更快。