多线程爬虫实战--爬取表情包

多线程爬虫实战–爬取表情包

1.0版本–下载表情包之同步爬虫完成
方法一
import requests
from lxml import etree
import os

def parse_page(url):
    """Fetch one doutula.com list page and save every meme image on it.

    Images are written to D:/imgs/ named by their alt text; files that
    already exist are skipped.  A failure on one image is reported and
    does not abort the rest of the page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    # The site lazy-loads images: the real URL is in data-original.
    imgs = html.xpath('//div[@class="page-content text-center"]//img')
    root = "D:/imgs/"
    for img in imgs:
        img_url = img.get('data-original')
        img_name = img.get('alt')
        if img_url is None or img_name is None:
            continue  # placeholder <img> tags carry neither attribute
        path = root + img_name + ".jpg"
        try:
            # makedirs(exist_ok=True) is race-free and creates parents,
            # unlike the exists()+mkdir pair it replaces.
            os.makedirs(root, exist_ok=True)
            if not os.path.exists(path):
                r = requests.get(img_url, headers=headers)
                with open(path, 'wb') as f:
                    f.write(r.content)
            else:
                print("文件已经存在")
        except (requests.RequestException, OSError) as e:
            # Narrow catch instead of the original bare except, which
            # silently hid every error including KeyboardInterrupt.
            print("文件未保存成功", e)

def main():
    """Crawl list pages 1 through 100 sequentially."""
    base = 'http://www.doutula.com/photo/list/?page=%d'
    for page in range(1, 101):
        parse_page(base % page)


if __name__ == '__main__':
    main()
方法二
import requests
from lxml import etree
from urllib import request
import os
import re

def parse_page(url):
    """Fetch one doutula.com list page and download each meme via urllib.

    Filenames come from the image alt text, with punctuation that is
    illegal or awkward in Windows paths replaced by spaces; the URL's
    original extension is kept.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    # Lazy-loaded images keep their real URL in data-original.
    imgs = html.xpath('//div[@class="page-content text-center"]//img')
    # urlretrieve cannot create the target directory itself; without
    # this the first download fails on a fresh machine.
    os.makedirs('d:\\imgs', exist_ok=True)
    for img in imgs:
        img_url = img.get('data-original')
        alt = img.get('alt')
        if img_url is not None and alt is not None:
            # Scrub characters that break Windows filenames.
            alt = re.sub(r'[\??\.,。!!|]', ' ', alt)
            suffix = os.path.splitext(img_url)[1]
            filename = 'd:\\imgs\\' + alt + suffix
            request.urlretrieve(img_url, filename)

def main():
    """Entry point for the urllib variant.

    NOTE: the ``break`` deliberately limits this demo to the first list
    page even though the range covers pages 1-100.
    """
    for page in range(1, 101):
        parse_page('http://www.doutula.com/photo/list/?page=%d' % page)
        break


if __name__ == '__main__':
    main()
2.0版本–使用生产者与消费者模式多线程下载表情包
import os
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree

class Producer(threading.Thread):
    """Producer thread for the scraping pipeline.

    Repeatedly takes a list-page URL off ``page_queue``, scrapes it,
    and puts ``(img_url, filename)`` work items onto ``img_queue`` for
    the Consumer threads.  Terminates when ``page_queue`` is drained.
    """

    # Shared by all producer instances; a User-Agent is enough to get
    # past the site's trivial bot filtering.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # queue of list-page URL strings
        self.img_queue = img_queue    # queue of (img_url, filename) tuples

    def run(self):
        while True:
            try:
                # Non-blocking get fixes the original empty()-then-get()
                # race: with several producers the queue can drain
                # between the two calls, leaving get() blocked forever.
                url = self.page_queue.get(block=False)
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Scrape one list page and enqueue every meme found on it."""
        response = requests.get(url, headers=self.headers)
        html = etree.HTML(response.text)
        # Lazy loading: the real image URL is in data-original.
        imgs = html.xpath('//div[@class="page-content text-center"]//img')
        for img in imgs:
            img_url = img.get('data-original')
            alt = img.get('alt')
            if img_url is not None and alt is not None:
                # Scrub characters that break Windows filenames.
                alt = re.sub(r'[\??\.,。!!|]', ' ', alt)
                suffix = os.path.splitext(img_url)[1]
                filename = 'd:\\imgs\\' + alt + suffix
                self.img_queue.put((img_url, filename))
                
class Consumer(threading.Thread):
    """Consumer thread: downloads the images queued by the Producers.

    Terminates once no image is pending and no list pages remain, i.e.
    no producer can still generate new work.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # consumed only to detect shutdown
        self.img_queue = img_queue    # (img_url, filename) work items

    def run(self):
        while True:
            try:
                # Timed get replaces the original emptiness check plus
                # unconditional blocking get(): a consumer that lost the
                # race to a sibling thread would otherwise block forever
                # on a drained queue.
                img_url, filename = self.img_queue.get(timeout=0.5)
            except Empty:
                # No image ready: if no pages remain either, the
                # pipeline is finished; otherwise keep waiting.
                if self.page_queue.empty():
                    break
                continue
            try:
                request.urlretrieve(img_url, filename)
                print(filename + "下载完成")
            except OSError as e:
                # One failed download must not kill the whole thread.
                print(filename + "下载失败", e)

def main():
    """Wire up the pipeline: queue 100 list-page URLs, then start
    5 producer and 5 consumer threads."""
    page_queue = Queue(100)
    img_queue = Queue(500)
    for x in range(1, 101):
        page_queue.put('http://www.doutula.com/photo/list/?page=%d' % x)
    # Create the download directory up front; urlretrieve in Consumer
    # cannot create it and would fail on every item otherwise.
    os.makedirs('d:\\imgs', exist_ok=True)
    for _ in range(5):
        Producer(page_queue, img_queue).start()
    for _ in range(5):
        Consumer(page_queue, img_queue).start()


if __name__ == '__main__':
    main()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值