多线程爬虫

爬取美桌网站迪丽热巴的照片

首先进行网页解析,目标页面的地址形如下面的链接(代码中实际抓取的是第 2 页 dilireba_2.html):

http://www.win4000.com/mt/dilireba_1.html

多线程

import threading
from lxml import etree
from collections import deque
from pybloom_live import BloomFilter
from urllib import request
import time
 
 
class imgInfo:
    """Plain record pairing an image address with its title.

    Attributes:
        url: The value passed as ``url``, stored unchanged.
        title: The value passed as ``title``, stored unchanged.
    """

    def __init__(self, url, title):
        # Bind both fields in one tuple assignment.
        self.url, self.title = url, title
 
class clawer:
    """Collect matching image URLs from a page and save images to disk.

    ``request_header``, ``bfdownload`` and ``cur_que`` are class attributes,
    so they are shared by every instance of this class.
    """

    # HTTP headers sent with every request made by this class.
    request_header={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Connection': 'close'
        }
    # Filter holding the image URLs that were already queued.
    # NOTE(review): arguments are presumably (capacity, error_rate) - confirm
    # against the pybloom_live documentation.
    bfdownload=BloomFilter(1024*1024,0.01)
    # Queue of imgInfo entries waiting to be downloaded.
    cur_que=deque()

    def __init__(self,image_file_name):
        """Remember the path prefix used when writing image files.

        Args:
            image_file_name: String prepended to ``title + '.jpg'`` to build
                the output path in ``getImge``.
        """
        self.image_file_name=image_file_name

    def getImageUrl(self,url):
        """Fetch ``url`` and queue every matching image that is not queued yet.

        An ``<img>`` element matches when it has a ``title`` attribute
        containing '迪丽热巴' and a ``src`` attribute that is not already in
        ``bfdownload``. Matches are appended to ``cur_que`` as ``imgInfo``
        objects and their ``src`` is added to ``bfdownload``.

        Args:
            url: Address of the HTML page to scan.
        """
        req=request.Request(url,headers=self.request_header)
        # Close the HTTP response as soon as the body has been read.
        with request.urlopen(req) as response:
            html_page=response.read()
        # NOTE(review): lower-casing the whole document also lower-cases
        # attribute values such as ``src``; kept as in the original - confirm
        # that image URLs on the target site are not case-sensitive.
        html=etree.HTML(html_page.lower().decode('utf-8'))
        for img in html.xpath('//img[@title]'):
            src=img.get('src')
            if src is None:
                # An <img> with a title but no src has nothing to download.
                continue
            title=img.attrib['title']
            if '迪丽热巴' in title and src not in self.bfdownload:
                self.cur_que.append(imgInfo(src,title))
                self.bfdownload.add(src)

    def getImge(self,url,title):
        """Download ``url`` and write the bytes to ``<prefix><title>.jpg``.

        Args:
            url: Address of the image to download.
            title: Used as the file name (without extension); an existing
                file with the same name is overwritten.
        """
        req=request.Request(url,headers=self.request_header)
        with request.urlopen(req) as response:
            image_bytes=response.read()
        # The context manager closes the file even if the write fails.
        with open(self.image_file_name+title+'.jpg','wb') as file:
            file.write(image_bytes)
    
    
if __name__=='__main__':
    # Imported here because only the script entry point needs it.
    from concurrent.futures import ThreadPoolExecutor

    clw=clawer('C:\\Users\\Administrator\\Desktop\\图片\\')
    clw.getImageUrl('http://www.win4000.com/mt/dilireba_2.html')
    max_thread=10
    start=time.time()
    # Multi-threaded download: the executor caps concurrency at max_thread
    # and, on leaving the ``with`` block, waits for every download to finish.
    # This avoids busy-waiting on a full pool and guarantees that no download
    # is cut off when the main thread reaches the end of the script.
    # (For a single-threaded run, call clw.getImge(...) directly in the loop.)
    with ThreadPoolExecutor(max_workers=max_thread) as pool:
        pending=[]
        # Drain the queue; an empty deque is falsy, so no exception is needed
        # to detect the end of the work.
        while clw.cur_que:
            imgif=clw.cur_que.popleft()
            pending.append(pool.submit(clw.getImge,imgif.url,imgif.title))
        for job in pending:
            try:
                job.result()
            except Exception as err:
                # Report a failed download but keep going with the others.
                print(err)
    # Elapsed wall-clock seconds, including the downloads themselves.
    print(time.time()-start)


参考:https://blog.csdn.net/dhaiuda/article/details/80980252#commentsedit

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值