多线程爬虫实战--彼岸图网壁纸爬取

多线程爬虫实战–彼岸图网壁纸爬取

方法一:普通(单线程)方法爬取
import requests
from lxml import etree
import os
from urllib import request

# Request header that identifies the crawler as a desktop Chrome browser;
# sent with every request to pic.netbian.com.
headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
def parse_kind():
    """Fetch the site's front page and collect every wallpaper category.

    Returns a list of ``[title, href]`` pairs, one per anchor found in
    the "classify clearfix" navigation block.
    """
    resp = requests.get('http://pic.netbian.com/', headers=headers)
    # The page does not reliably declare its charset; let requests detect it.
    resp.encoding = resp.apparent_encoding
    tree = etree.HTML(resp.text)
    anchors = tree.xpath('//div[@class="classify clearfix"]//a')
    return [[a.get('title'), a.get('href')] for a in anchors]
def parse_page(kind_list):
    """Download every wallpaper for each category in *kind_list*.

    :param kind_list: list of ``[title, href]`` pairs as produced by
        ``parse_kind()``.

    For each category this walks list pages 1..100, opens each picture's
    detail page and saves the full-size image under ``d:\\imgs\\<title>\\``.
    """
    base = 'http://pic.netbian.com'
    for title, href in kind_list:
        list_url = base + href
        root = 'd:\\imgs\\' + title + '\\'
        # makedirs creates missing parents and tolerates an existing
        # directory, unlike the original bare os.mkdir which raised
        # FileNotFoundError / FileExistsError on first or repeated runs.
        os.makedirs(root, exist_ok=True)
        for page in range(1, 101):
            # Page 1 of a category has no index suffix; later pages
            # are index_<n>.html. (Original shadowed its loop variables
            # `i` and `x` in the inner loops; renamed for clarity.)
            if page == 1:
                url = list_url
            else:
                url = list_url + 'index_' + str(page) + '.html'
            response = requests.get(url, headers=headers)
            response.encoding = response.apparent_encoding
            html = etree.HTML(response.text)
            detail_hrefs = html.xpath('//ul[@class="clearfix"]//a/@href')
            for detail_href in detail_hrefs:
                r = requests.get(base + detail_href, headers=headers)
                r.encoding = r.apparent_encoding
                detail = etree.HTML(r.text)
                for img in detail.xpath('//div[@class="photo-pic"]//img'):
                    img_url = img.get('src')
                    img_name = img.get('alt')
                    request.urlretrieve(base + img_url, root + img_name + '.jpg')

def main():
    """Entry point: discover all categories, then crawl each of them."""
    parse_page(parse_kind())


if __name__ == '__main__':
    main()
方法二:多线程(生产者-消费者模型)方法爬取
import os
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree

# Request header that identifies the crawler as a desktop Chrome browser;
# sent with every request to pic.netbian.com.
headers={
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
        }
def parse_kind():
    """Collect every wallpaper category from the site's front page.

    Creates one output directory per category under ``d:\\imgs\\`` and
    returns two parallel lists: the category list-page URLs and the
    matching local directory paths.
    """
    response = requests.get('http://pic.netbian.com/', headers=headers)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    kinds = html.xpath('//div[@class="classify clearfix"]//a')
    img_kinds = []
    roots = []
    for kind in kinds:
        url_f = 'http://pic.netbian.com' + kind.get('href')
        root = 'd:\\imgs\\' + kind.get('title') + '\\'
        # makedirs with exist_ok avoids the FileExistsError that the
        # original os.mkdir raised on a second run, and also creates
        # the d:\imgs parent itself if it is missing.
        os.makedirs(root, exist_ok=True)
        img_kinds.append(url_f)
        roots.append(root)
    return img_kinds, roots
    
class Producer(threading.Thread):
    """Worker thread: takes category list-page URLs off ``page_queue``,
    resolves every picture on them, and puts ``(image URL, filename)``
    download jobs onto ``img_queue`` for the Consumer threads."""

    # Browser-like User-Agent sent with every request.
    headers={
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
        }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # list-page URLs still to be parsed
        self.img_queue = img_queue    # (image URL, filename) jobs produced

    def run(self):
        # Exit once every list page has been claimed by some producer.
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url):
        """Parse one category list page and enqueue its full-size images.

        Fixes two defects in the original: the method was declared as
        ``parse_page(url, roots)`` without ``self`` (so the call in ``run``
        bound the instance to ``url`` and crashed), and it enqueued the
        picture *detail-page* URLs (HTML) instead of the images themselves.
        The save directory is derived from the category slug in *url*
        because the original ``roots`` value was never actually supplied.
        """
        response = requests.get(url, headers=self.headers)
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        detail_hrefs = html.xpath('//ul[@class="clearfix"]//a/@href')
        # e.g. http://pic.netbian.com/4kfengjing/index_2.html -> '4kfengjing'
        slug = url.split('/')[3]
        root = 'd:\\imgs\\' + slug + '\\'
        os.makedirs(root, exist_ok=True)
        for href in detail_hrefs:
            r = requests.get('http://pic.netbian.com' + href, headers=self.headers)
            r.encoding = r.apparent_encoding
            detail = etree.HTML(r.text)
            for img in detail.xpath('//div[@class="photo-pic"]//img'):
                img_url = 'http://pic.netbian.com' + img.get('src')
                filename = root + img.get('alt') + '.jpg'
                self.img_queue.put((img_url, filename))
                
class Consumer(threading.Thread):
    """Worker thread: downloads ``(image URL, filename)`` jobs from
    ``img_queue`` until both queues are drained."""

    def __init__(self, page_queue, img_queue, *args, poll_timeout=1.0, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue      # producers' input; only used to decide when to stop
        self.img_queue = img_queue        # (image URL, filename) jobs to download
        self.poll_timeout = poll_timeout  # seconds to wait for a job before re-checking exit

    def run(self):
        while True:
            # The original empty()-then-get() pair was racy: another consumer
            # could claim the last job in between, leaving this thread blocked
            # on get() forever. A blocking get with a timeout is race-free and
            # periodically re-checks whether all work is done.
            try:
                img_url, filename = self.img_queue.get(timeout=self.poll_timeout)
            except Empty:
                if self.page_queue.empty():
                    break
                continue
            request.urlretrieve(img_url, filename)
            print(filename + "下载完成")
            
            
def main():
    """Build the work queues, seed every category list page, and start
    5 producer and 5 consumer threads."""
    # Unbounded queues: main() enqueues ~100 URLs per category *before*
    # any producer thread starts, so the original bounded Queue(100)
    # deadlocked on put() as soon as it filled up.
    page_queue = Queue()
    img_queue = Queue()
    img_kinds, roots = parse_kind()
    for url_f in img_kinds:
        for x in range(1, 101):
            # Page 1 of a category has no index_ suffix.
            if x == 1:
                url = url_f
            else:
                url = url_f + 'index_' + str(x) + '.html'
            page_queue.put(url)
    for _ in range(5):
        Producer(page_queue, img_queue).start()
    for _ in range(5):
        Consumer(page_queue, img_queue).start()


if __name__ == '__main__':
    main()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值