多线程爬虫实战——彼岸图网壁纸爬取
普通（单线程）方法爬取
import requests
from lxml import etree
import os
from urllib import request
# Pretend to be a desktop Chrome browser so the site serves normal HTML pages.
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}
def parse_kind():
    """Scrape the pic.netbian.com home page and return the category links.

    Returns a list of [title, href] pairs, one per anchor found in the
    "classify clearfix" navigation bar.
    """
    response = requests.get('http://pic.netbian.com/', headers=headers)
    # The site does not declare its charset reliably; trust the detected one.
    response.encoding = response.apparent_encoding
    document = etree.HTML(response.text)
    anchors = document.xpath('//div[@class="classify clearfix"]//a')
    return [[a.get('title'), a.get('href')] for a in anchors]
def parse_page(kind_list):
    """Crawl every category: walk listing pages 1-100, follow each thumbnail
    to its detail page, and download the full-size image to d:\\imgs\\<title>.

    :param kind_list: list of [title, href] pairs as returned by parse_kind()
    """
    for title, href in kind_list:
        url_f = 'http://pic.netbian.com' + href
        root = os.path.join('d:\\imgs', title)
        # makedirs + exist_ok: os.mkdir raised FileExistsError on any re-run.
        os.makedirs(root, exist_ok=True)
        for page in range(1, 101):
            # The first listing page of a category has no index suffix.
            if page == 1:
                url = url_f
            else:
                url = url_f + 'index_' + str(page) + '.html'
            response = requests.get(url, headers=headers)
            response.encoding = response.apparent_encoding
            html = etree.HTML(response.text)
            # hrefs of the per-image detail pages on this listing page
            detail_hrefs = html.xpath('//ul[@class="clearfix"]//a/@href')
            for detail in detail_hrefs:
                r = requests.get('http://pic.netbian.com' + detail, headers=headers)
                r.encoding = r.apparent_encoding
                h = etree.HTML(r.text)
                for img in h.xpath('//div[@class="photo-pic"]//img'):
                    img_url = img.get('src')
                    img_name = img.get('alt')
                    request.urlretrieve('http://pic.netbian.com' + img_url,
                                        os.path.join(root, img_name + '.jpg'))
def main():
    """Entry point: discover the categories, then crawl them sequentially."""
    parse_page(parse_kind())


if __name__ == '__main__':
    main()
多线程方法爬取
import requests
from lxml import etree
import os
from urllib import request
from queue import Queue
import threading
# Pretend to be a desktop Chrome browser so the site serves normal HTML pages.
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}
def parse_kind():
    """Fetch the home page and return (category_page_urls, category_folders).

    Side effect: creates d:\\imgs\\<title>\\ for every category found.

    :return: tuple of two parallel lists — full category URLs and the
             local folder path for each category.
    """
    response = requests.get('http://pic.netbian.com/', headers=headers)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    kinds = html.xpath('//div[@class="classify clearfix"]//a')
    img_kinds = []
    roots = []
    for kind in kinds:
        href = kind.get('href')
        title = kind.get('title')
        url_f = 'http://pic.netbian.com' + href
        root = 'd:\\imgs\\' + title + '\\'
        # makedirs + exist_ok: os.mkdir raised FileExistsError on any re-run.
        os.makedirs(root, exist_ok=True)
        img_kinds.append(url_f)
        roots.append(root)
    return img_kinds, roots
class Producer(threading.Thread):
    """Worker thread: pulls listing-page URLs off page_queue, scrapes each
    page, and feeds (image_url, filename) download jobs into img_queue."""

    headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # queue of listing-page URLs to scrape
        self.img_queue = img_queue    # queue of (img_url, filename) jobs

    def run(self):
        while True:
            # Stop when there is no more work to claim.
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url, root='d:\\imgs\\'):
        """Scrape one listing page and enqueue its images for download.

        Bug fix: the original signature was ``parse_page(url, roots)`` with no
        ``self`` — the bound call ``self.parse_page(url)`` therefore passed the
        thread instance as ``url`` and crashed inside requests.  It also zipped
        each page's images against the unrelated list of category folders.
        The category folder cannot be derived from the URL alone, so files
        default to *root*; callers may pass a category folder explicitly.

        NOTE(review): the hrefs matched here look like detail-page links, not
        direct image srcs — confirm the downloaded files are real images.
        """
        response = requests.get(url, headers=self.headers)
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        img_urls = html.xpath('//ul[@class="clearfix"]//a/@href')
        names = html.xpath('//ul[@class="clearfix"]//b/text()')
        for href, name in zip(img_urls, names):
            img_url = 'http://pic.netbian.com' + href
            filename = root + name + '.jpg'
            self.img_queue.put((img_url, filename))
class Consumer(threading.Thread):
    """Worker thread: drains img_queue and downloads each image to disk."""

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # watched only to decide when to stop
        self.img_queue = img_queue    # queue of (img_url, filename) jobs

    def run(self):
        # Local import: the module header only imports Queue, not Empty.
        from queue import Empty
        while True:
            # Exit once both queues look drained.  empty() is inherently racy
            # (a consumer may win the check then lose the item to a sibling,
            # or run before producers have filled the queue), so the get()
            # below uses a timeout instead of blocking forever.
            if self.img_queue.empty() and self.page_queue.empty():
                break
            try:
                img_url, filename = self.img_queue.get(timeout=5)
            except Empty:
                continue
            request.urlretrieve(img_url, filename)
            print(filename + "下载完成")
def main():
    """Build the work queues, then launch 5 producer and 5 consumer threads.

    Bug fix: page_queue was Queue(100), but every page URL is enqueued BEFORE
    any producer thread starts — once more than 100 URLs accumulate, put()
    blocks forever and the program deadlocks.  The page queue is therefore
    unbounded; img_queue stays bounded to throttle the producers.
    """
    page_queue = Queue()
    img_queue = Queue(500)
    img_kinds, roots = parse_kind()
    for url_f in img_kinds:
        for page in range(1, 101):
            # The first listing page of a category has no index suffix.
            url = url_f if page == 1 else url_f + 'index_' + str(page) + '.html'
            page_queue.put(url)
    for _ in range(5):
        Producer(page_queue, img_queue).start()
    for _ in range(5):
        Consumer(page_queue, img_queue).start()


if __name__ == '__main__':
    main()