多线程爬虫实战——彼岸图网壁纸爬取
普通（单线程）方法爬取
import requests
from lxml import etree
import os
from urllib import request
# Pretend to be a desktop Chrome browser so the site serves normal HTML pages.
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}
def parse_kind():
    """Scrape the pic.netbian.com home page and return the category links.

    Returns a list of [title, href] pairs, one per anchor found in the
    "classify clearfix" navigation bar.
    """
    response = requests.get('http://pic.netbian.com/', headers=headers)
    # The site does not declare its charset reliably; trust the detected one.
    response.encoding = response.apparent_encoding
    document = etree.HTML(response.text)
    anchors = document.xpath('//div[@class="classify clearfix"]//a')
    return [[a.get('title'), a.get('href')] for a in anchors]
def parse_page(kind_list):
    """Crawl every category: walk listing pages 1-100, follow each thumbnail
    to its detail page, and download the full-size image to d:\\imgs\\<title>.

    :param kind_list: list of [title, href] pairs as returned by parse_kind()
    """
    for title, href in kind_list:
        url_f = 'http://pic.netbian.com' + href
        root = os.path.join('d:\\imgs', title)
        # makedirs + exist_ok: os.mkdir raised FileExistsError on any re-run.
        os.makedirs(root, exist_ok=True)
        for page in range(1, 101):
            # The first listing page of a category has no index suffix.
            if page == 1:
                url = url_f
            else:
                url = url_f + 'index_' + str(page) + '.html'
            response = requests.get(url, headers=headers)
            response.encoding = response.apparent_encoding
            html = etree.HTML(response.text)
            # hrefs of the per-image detail pages on this listing page
            detail_hrefs = html.xpath('//ul[@class="clearfix"]//a/@href')
            for detail in detail_hrefs:
                r = requests.get('http://pic.netbian.com' + detail, headers=headers)
                r.encoding = r.apparent_encoding
                h = etree.HTML(r.text)
                for img in h.xpath('//div[@class="photo-pic"]//img'):
                    img_url = img.get('src')
                    img_name = img.get('alt')
                    request.urlretrieve('http://pic.netbian.com' + img_url,
                                        os.path.join(root, img_name + '.jpg'))
def main():
    """Entry point: discover the categories, then crawl them sequentially."""
    parse_page(parse_kind())


if __name__ == '__main__':
    main()
多线程方法爬取
import requests
from lxml import etree
import os
from urllib import request
from queue import Queue
import threading
# Pretend to be a desktop Chrome browser so the site serves normal HTML pages.
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}
def parse_kind():
    """Fetch the home page and return (category_page_urls, category_folders).

    Side effect: creates d:\\imgs\\<title>\\ for every category found.

    :return: tuple of two parallel lists — full category URLs and the
             local folder path for each category.
    """
    response = requests.get('http://pic.netbian.com/', headers=headers)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    kinds = html.xpath('//div[@class="classify clearfix"]//a')
    img_kinds = []
    roots = []
    for kind in kinds:
        href = kind.get('href')
        title = kind.get('title')
        url_f = 'http://pic.netbian.com' + href
        root = 'd:\\imgs\\' + title + '\\'
        # makedirs + exist_ok: os.mkdir raised FileExistsError on any re-run.
        os.makedirs(root, exist_ok=True)
        img_kinds.append(url_f)
        roots.append(root)
    return img_kinds, roots
class Producer(threading.Thread):
    """Worker thread: pulls listing-page URLs off page_queue, scrapes each
    page, and feeds (image_url, filename) download jobs into img_queue."""

    headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # queue of listing-page URLs to scrape
        self.img_queue = img_queue    # queue of (img_url, filename) jobs

    def run(self):
        while True:
            # Stop when there is no more work to claim.
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url, root='d:\\imgs\\'):
        """Scrape one listing page and enqueue its images for download.

        Bug fix: the original signature was ``parse_page(url, roots)`` with no
        ``self`` — the bound call ``self.parse_page(url)`` therefore passed the
        thread instance as ``url`` and crashed inside requests.  It also zipped
        each page's images against the unrelated list of category folders.
        The category folder cannot be derived from the URL alone, so files
        default to *root*; callers may pass a category folder explicitly.

        NOTE(review): the hrefs matched here look like detail-page links, not
        direct image srcs — confirm the downloaded files are real images.
        """
        response = requests.get(url, headers=self.headers)
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        img_urls = html.xpath('//ul[@class="clearfix"]//a/@href')
        names = html.xpath('//ul[@class="clearfix"]//b/text()')
        for href, name in zip(img_urls, names):
            img_url = 'http://pic.netbian.com' + href
            filename = root + name + '.jpg'
            self.img_queue.put((img_url, filename))
class Consumer(threading.Thread):
    """Worker thread: drains img_queue and downloads each image to disk."""

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # watched only to decide when to stop
        self.img_queue = img_queue    # queue of (img_url, filename) jobs

    def run(self):
        # Local import: the module header only imports Queue, not Empty.
        from queue import Empty
        while True:
            # Exit once both queues look drained.  empty() is inherently racy
            # (a consumer may win the check then lose the item to a sibling,
            # or run before producers have filled the queue), so the get()
            # below uses a timeout instead of blocking forever.
            if self.img_queue.empty() and self.page_queue.empty():
                break
            try:
                img_url, filename = self.img_queue.get(timeout=5)
            except Empty:
                continue
            request.urlretrieve(img_url, filename)
            print(filename + "下载完成")
def main():
    """Build the work queues, then launch 5 producer and 5 consumer threads.

    Bug fix: page_queue was Queue(100), but every page URL is enqueued BEFORE
    any producer thread starts — once more than 100 URLs accumulate, put()
    blocks forever and the program deadlocks.  The page queue is therefore
    unbounded; img_queue stays bounded to throttle the producers.
    """
    page_queue = Queue()
    img_queue = Queue(500)
    img_kinds, roots = parse_kind()
    for url_f in img_kinds:
        for page in range(1, 101):
            # The first listing page of a category has no index suffix.
            url = url_f if page == 1 else url_f + 'index_' + str(page) + '.html'
            page_queue.put(url)
    for _ in range(5):
        Producer(page_queue, img_queue).start()
    for _ in range(5):
        Consumer(page_queue, img_queue).start()


if __name__ == '__main__':
    main()