多线程爬取小米商店应用（提高爬取效率）

最新推荐文章于 2021-03-12 19:19:22 发布

dongxieaitonglao

最新推荐文章于 2021-03-12 19:19:22 发布

阅读量477

点赞数

本文链接：https://blog.csdn.net/dongxieaitonglao/article/details/106747194

版权

import json
import time
from queue import Empty, Queue
from threading import Lock, Thread

import requests

class XiaomiSpider:
    """Multi-threaded crawler for one category of the Xiaomi app store.

    Page URLs are placed on a shared queue; worker threads drain the queue,
    fetch each page, print every application's display name and add the
    number of applications found to ``self.n``.

    Args:
        page_count: Number of listing pages to enqueue (pages ``0`` to
            ``page_count - 1``).
        thread_count: Number of worker threads started by ``main``.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
    """

    def __init__(self, page_count=67, thread_count=5, timeout=10):
        self.url = 'http://app.mi.com/categotyAllListApi?page={}&categoryId=2&pageSize=30'
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.page_count = page_count
        self.thread_count = thread_count
        self.timeout = timeout
        # Queue of page URLs that still have to be fetched.
        self.url_queue = Queue()
        # Running total of applications seen; reported by main().
        self.n = 0
        # ``self.n += ...`` is a read-modify-write, so workers serialize it.
        self._lock = Lock()

    def url_in(self):
        """Put the URL of every listing page on the queue."""
        for page in range(self.page_count):
            self.url_queue.put(self.url.format(page))

    def get_data(self):
        """Worker loop: fetch, parse and print pages until the queue is empty.

        A page that cannot be fetched or parsed is reported and skipped so
        that one bad response does not stop the worker.
        """
        while True:
            # get_nowait() both tests and takes in one step. Checking empty()
            # and then calling a blocking get() could hang forever if another
            # worker took the last URL in between.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                return

            try:
                response = requests.get(
                    url=url,
                    headers=self.headers,
                    timeout=self.timeout,
                )
                # The API answers with JSON; html['data'] -> [{}, {}, ...]
                page = json.loads(response.content.decode('utf-8'))
                names = [app['displayName'] for app in page['data']]
            except (requests.RequestException, ValueError, KeyError) as err:
                # ValueError covers both invalid JSON and invalid UTF-8.
                print('请求失败:', url, err)
                continue

            for app_name in names:
                print(app_name)
            with self._lock:
                self.n += len(names)

    def main(self):
        """Enqueue all pages, run the workers to completion, print the total."""
        self.url_in()

        workers = [
            Thread(target=self.get_data) for _ in range(self.thread_count)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        print('应用数量:', self.n)

if __name__ == '__main__':
    # Run one full crawl and report how long it took in seconds.
    began = time.time()
    XiaomiSpider().main()
    elapsed = time.time() - began
    print('执行时间:%.2f' % elapsed)