Python多线程爬取表情包

最新推荐文章于 2022-07-31 23:56:48 发布

我的目标吉良吉影

最新推荐文章于 2022-07-31 23:56:48 发布

阅读量190

点赞数

分类专栏：爬虫、线程　文章标签：多线程

本文链接：https://blog.csdn.net/snake_boy_/article/details/108506604

版权

爬虫同时被 2 个专栏收录

2 篇文章 0 订阅

订阅专栏

线程

2 篇文章 0 订阅

订阅专栏

import requests
from lxml import etree
import threading
import queue
import time
# Work queues linking the three pipeline stages.
request_url = queue.Queue()   # listing-page URLs, read by the 请求 workers
url_queue = queue.Queue()     # sticker-set page URLs, read by the pic_url workers
last_queue = queue.Queue()    # image URLs, read by the downlode workers

# One lock per stage; each is handed to that stage's workers at start-up.
mutex = threading.Lock()
url_lock = threading.Lock()
pic_downlode = threading.Lock()

# Seed the first queue with listing pages 0 through 39.
for page in range(40):
    request_url.put(
        f'https://www.fabiaoqing.com/bqb/lists/type/liaomei/page/{page}.html'
    )
# Stage 1: fetch each listing page and collect the URLs of its sticker sets.
class 请求(threading.Thread):
    """Worker thread that turns listing-page URLs into sticker-set URLs.

    Args:
        request_url: queue of listing-page URLs to fetch. It is expected to be
            filled before the worker starts; the worker exits once it is empty.
        url_queue: queue that receives one absolute URL per sticker set found.
        loc: lock kept on the instance as ``self.loc`` for backward
            compatibility. It is no longer acquired: ``queue.Queue`` does its
            own locking, and holding an extra lock around a blocking ``get()``
            could deadlock the workers that needed the same lock in ``parse``.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    # Seconds to wait for a listing page before giving up on it.
    request_timeout = 10

    def __init__(self, request_url, url_queue, loc):
        super().__init__()
        self.q = request_url
        self.queue = url_queue
        self.loc = loc

    def run(self):
        """Fetch and parse listing pages until the input queue is empty."""
        while True:
            # Non-blocking get: an empty queue ends the worker instead of
            # leaving it stuck forever inside get().
            try:
                page_url = self.q.get_nowait()
            except queue.Empty:
                break
            try:
                html = requests.get(
                    url=page_url,
                    headers=self.headers,
                    timeout=self.request_timeout,
                ).text
            except requests.RequestException as exc:
                # One failed page must not kill the worker; report and go on.
                print(f'failed to fetch {page_url}: {exc}')
                continue
            self.parse(html)

    def parse(self, response):
        """Queue the absolute URL of every sticker set linked in ``response``.

        Args:
            response: HTML text of one listing page.
        """
        tree = etree.HTML(response)
        for href in tree.xpath('//div//a[@class="bqba"]/@href'):
            self.queue.put('https://www.fabiaoqing.com' + href)
            # Pause between puts to throttle the downstream requests.
            time.sleep(0.2)
# Stage 2: open each sticker-set page and collect the image URLs on it.
class pic_url(threading.Thread):
    """Worker thread that turns sticker-set page URLs into image URLs.

    Args:
        queue: queue of sticker-set page URLs to fetch.
        last_queue: queue that receives every image URL found.
        loc: lock kept on the instance as ``self.loc`` for backward
            compatibility. It is no longer acquired: ``queue.Queue`` does its
            own locking, and holding an extra lock around a blocking ``get()``
            stalls every other worker sharing that lock.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    # Seconds to wait for a sticker-set page before giving up on it.
    request_timeout = 10
    # Seconds to wait for new work before assuming upstream has finished.
    idle_timeout = 15

    def __init__(self, queue, last_queue, loc):
        super().__init__()
        self.queue = queue
        self.last_queue = last_queue
        self.loc = loc

    def run(self):
        """Process sticker-set pages until no work arrives for ``idle_timeout``."""
        while True:
            # Waiting with a timeout (instead of exiting the moment the queue
            # looks empty) keeps the worker alive while producers are still
            # filling the queue, yet lets it terminate once they are done.
            try:
                page_url = self.queue.get(timeout=self.idle_timeout)
            except queue.Empty:
                break
            try:
                html = requests.get(
                    url=page_url,
                    headers=self.headers,
                    timeout=self.request_timeout,
                ).text
            except requests.RequestException as exc:
                # One failed page must not kill the worker; report and go on.
                print(f'failed to fetch {page_url}: {exc}')
                continue
            tree = etree.HTML(html)
            image_urls = tree.xpath('//img[@class="bqbppdetail lazy"]/@data-original')
            for image_url in image_urls:
                # Use the queue given to the constructor, not the module global.
                self.last_queue.put(image_url)
                # Pause between puts to throttle the downstream requests.
                time.sleep(0.5)
# Stage 3: download every image URL and save it to disk.
class downlode(threading.Thread):
    """Worker thread that downloads image URLs and writes them to ``save_dir``.

    Args:
        que: queue of image URLs to download.
        loc: lock kept on the instance as ``self.loc`` for backward
            compatibility. It is no longer acquired: ``queue.Queue`` does its
            own locking, and holding an extra lock around a blocking ``get()``
            stalls every other worker sharing that lock.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    # Prefix prepended to every file name; must end with a path separator.
    save_dir = 'E:\\biaoqing\\'
    # Seconds to wait for an image before giving up on it.
    request_timeout = 10
    # Seconds to wait for new work before assuming upstream has finished.
    idle_timeout = 15

    def __init__(self, que, loc):
        super().__init__()
        self.que = que
        self.loc = loc

    @staticmethod
    def _filename_for(url):
        """Return the file name used to store ``url``.

        The last 36 characters of the URL are used; if that slice still
        contains a ``/`` (short URL), only the part after the last ``/`` is
        kept so the result is never a nested path.
        """
        return url[-36:].rsplit('/', 1)[-1]

    def run(self):
        """Download images until no work arrives for ``idle_timeout``."""
        import os  # local import: only this method needs it

        while True:
            # Waiting with a timeout (instead of exiting the moment the queue
            # looks empty) keeps the worker alive while producers are still
            # filling the queue, yet lets it terminate once they are done.
            try:
                image_url = self.que.get(timeout=self.idle_timeout)
            except queue.Empty:
                break
            try:
                reply = requests.get(
                    url=image_url,
                    headers=self.headers,
                    timeout=self.request_timeout,
                )
                # Do not save an HTTP error page as if it were an image.
                reply.raise_for_status()
            except requests.RequestException as exc:
                print(f'failed to download {image_url}: {exc}')
                continue
            time.sleep(0.1)
            target = self.save_dir + self._filename_for(image_url)
            folder = os.path.dirname(target)
            if folder:
                os.makedirs(folder, exist_ok=True)
            with open(target, 'wb') as out:
                out.write(reply.content)

if __name__ == '__main__':
    time.sleep(0.1)

    # Stage 1: three workers read listing pages and fill url_queue.
    for _ in range(3):
        请求(request_url, url_queue, mutex).start()

    # Give stage 1 a head start before the consumers begin.
    time.sleep(3)

    # Stage 2: five workers turn sticker-set pages into image URLs.
    for _ in range(5):
        pic_url(url_queue, last_queue, url_lock).start()

    # Stage 3: ten workers download the images.
    for _ in range(10):
        downlode(last_queue, pic_downlode).start()

这个网站对访问频率有限制：当请求过于频繁时，第二步（获取表情包链接）会得到空列表，所以需要用 time.sleep 降低请求频率。

我的目标吉良吉影

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python多线程爬取表情包

import requests
from lxml import etree
import threading
import queue
import time
request_url=queue.Queue()
url_queue=queue.Queue()
last_queue=queue.Queue()
mutex=threading.Lock()
url_lock=threading.Lock()
pic_downlode=threading.Lock()
for i in range(40): …
复制链接

扫一扫