Python多线程爬取表情包

import requests
from lxml import etree
import threading
import queue
import time
# One lock per pipeline stage; each is handed to that stage's worker threads.
mutex = threading.Lock()
url_lock = threading.Lock()
pic_downlode = threading.Lock()

# Queues linking the stages:
#   request_url -> listing-page URLs waiting to be fetched
#   url_queue   -> sticker-set page URLs found on the listing pages
#   last_queue  -> image URLs waiting to be downloaded
request_url = queue.Queue()
url_queue = queue.Queue()
last_queue = queue.Queue()

# Seed the first stage with listing pages 0 through 39.
for i in range(40):
    url = f'https://www.fabiaoqing.com/bqb/lists/type/liaomei/page/{i}.html'
    request_url.put(url)
# Stage 1: fetch each listing page and collect the URLs of the sticker sets on it.
class 请求(threading.Thread):
    """Worker thread that turns listing-page URLs into sticker-set URLs.

    Each worker repeatedly takes one listing-page URL from the input queue,
    downloads the page, and puts the absolute URL of every
    ``<a class="bqba">`` link found on it onto the output queue.  The worker
    exits as soon as the input queue has nothing left to hand out.

    Args:
        request_url: Queue of listing-page URLs to fetch.
        url_queue: Queue that receives the sticker-set page URLs.
        loc: Lock supplied by the caller.  It is stored for backward
            compatibility but no longer wrapped around queue operations:
            ``queue.Queue`` does its own locking, and blocking on ``get()``
            while holding an extra lock could hang every worker sharing it.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    # Seconds to wait on the server before abandoning a request.
    request_timeout = 10

    def __init__(self, request_url, url_queue, loc):
        super().__init__()
        self.q = request_url
        self.queue = url_queue
        self.loc = loc

    def run(self):
        """Drain the input queue, parsing every page that downloads successfully."""
        while True:
            # Non-blocking get: if another worker took the last URL between
            # two iterations, stop instead of waiting forever.
            try:
                urls = self.q.get_nowait()
            except queue.Empty:
                break
            try:
                reponse = requests.get(
                    url=urls,
                    headers=self.headers,
                    timeout=self.request_timeout,
                ).text
            except requests.RequestException as exc:
                # One bad page should not take the whole worker down.
                print(f'request failed for {urls}: {exc}')
                continue
            self.parse(reponse)

    def parse(self, response):
        """Extract sticker-set links from a listing page and queue them.

        Args:
            response: HTML text of a listing page.
        """
        tree = etree.HTML(response)
        for href in tree.xpath('//div//a[@class="bqba"]/@href'):
            self.queue.put('https://www.fabiaoqing.com' + href)
            # Throttle output so later stages do not hit the site too quickly.
            time.sleep(0.2)
# Stage 2: visit each sticker-set page and collect the image links on it.
class pic_url(threading.Thread):
    """Worker thread that turns sticker-set page URLs into image URLs.

    Each worker repeatedly takes a sticker-set page URL from ``queue``,
    downloads the page, and puts the ``data-original`` attribute of every
    ``<img class="bqbppdetail lazy">`` element onto ``last_queue``.

    The worker exits once no new URL has arrived for ``idle_timeout``
    seconds, so a momentarily empty queue does not end it while producers
    are still running.

    Args:
        queue: Queue of sticker-set page URLs to fetch.
        last_queue: Queue that receives the image URLs.
        loc: Lock supplied by the caller.  It is stored for backward
            compatibility but no longer wrapped around queue operations:
            ``queue.Queue`` does its own locking, and blocking on ``get()``
            while holding an extra lock could hang every worker sharing it.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    # Seconds to wait for new work before treating the upstream stage as done.
    idle_timeout = 10
    # Seconds to wait on the server before abandoning a request.
    request_timeout = 10

    def __init__(self, queue, last_queue, loc):
        super().__init__()
        self.queue = queue
        self.last_queue = last_queue
        self.loc = loc

    def run(self):
        """Process sticker-set pages until the input queue stays empty."""
        while True:
            try:
                pic = self.queue.get(timeout=self.idle_timeout)
            except queue.Empty:
                break
            try:
                reponse = requests.get(
                    url=pic,
                    headers=self.headers,
                    timeout=self.request_timeout,
                ).text
            except requests.RequestException as exc:
                # One bad page should not take the whole worker down.
                print(f'request failed for {pic}: {exc}')
                continue
            tree_ = etree.HTML(reponse)
            for x in tree_.xpath('//img[@class="bqbppdetail lazy"]/@data-original'):
                # Use the queue this worker was given, not the module global.
                self.last_queue.put(x)
                # Throttle so the site is not hit too frequently.
                time.sleep(0.5)
# Stage 3: download the sticker images.
class downlode(threading.Thread):
    """Worker thread that downloads image URLs to disk.

    Each worker repeatedly takes an image URL from ``que``, downloads it and
    writes the response body to a file under ``save_dir``.  The file is named
    after the last path component of the URL.

    The worker exits once no new URL has arrived for ``idle_timeout``
    seconds, so a momentarily empty queue does not end it while producers
    are still running.

    Args:
        que: Queue of image URLs to download.
        loc: Lock supplied by the caller.  It is stored for backward
            compatibility but no longer wrapped around queue operations:
            ``queue.Queue`` does its own locking, and blocking on ``get()``
            while holding an extra lock could hang every worker sharing it.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    # Directory the images are written to; it is not created here.
    save_dir = 'E:\\biaoqing\\'
    # Seconds to wait for new work before treating the upstream stage as done.
    idle_timeout = 10
    # Seconds to wait on the server before abandoning a request.
    request_timeout = 10

    def __init__(self, que, loc):
        super().__init__()
        self.que = que
        self.loc = loc

    @staticmethod
    def _filename_from_url(url):
        """Return the last path component of *url*, without any query string.

        A fixed-length slice of the URL tail would pull in ``/`` and host
        fragments whenever the file name is shorter than the slice.
        """
        path = url.split('?', 1)[0].rstrip('/')
        return path.rsplit('/', 1)[-1]

    def run(self):
        """Download images until the input queue stays empty."""
        while True:
            try:
                image_url = self.que.get(timeout=self.idle_timeout)
            except queue.Empty:
                break
            filename = self.save_dir + self._filename_from_url(image_url)
            try:
                s = requests.get(
                    url=image_url,
                    headers=self.headers,
                    timeout=self.request_timeout,
                )
            except requests.RequestException as exc:
                # One bad image should not take the whole worker down.
                print(f'download failed for {image_url}: {exc}')
                continue
            # Short pause between downloads to keep the request rate down.
            time.sleep(0.1)
            with open(filename, 'wb') as fh:
                fh.write(s.content)

if __name__ == '__main__':
    time.sleep(0.1)

    # Stage 1: three workers fetch the listing pages.
    for _ in range(3):
        请求(request_url, url_queue, mutex).start()

    # Pause before starting the later stages, giving stage 1 a head start.
    time.sleep(3)

    # Stage 2: five workers extract image links from the sticker-set pages.
    for _ in range(5):
        pic_url(url_queue, last_queue, url_lock).start()

    # Stage 3: ten workers download the images.
    for _ in range(10):
        downlode(last_queue, pic_downlode).start()

注意:这个网站对访问频率有限制。请求过于频繁时,第二步(获取表情包链接)会返回空列表,所以需要用 time.sleep 在请求之间加入延时,降低抓取速度。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值