爬取王者荣耀壁纸

最新推荐文章于 2023-08-09 19:52:58 发布

果、失

最新推荐文章于 2023-08-09 19:52:58 发布

阅读量264

点赞数

文章标签： python 爬虫多线程

本文链接：https://blog.csdn.net/qq_50958709/article/details/113183815

版权

1.先进入王者荣耀壁纸主页然后查看网页源代码
发现网页源代码里并没有壁纸图片的 href。（在这里插入图片描述）
2.那就说明图片是异步加载的。打开开发者工具并切换到 Network 面板，可以点一下 Size 列，然后按照大小逐个查找。

3.这样就找到了对应的请求，它的 url 可以在该请求的 Headers 中找到。

4.点击到 Response，发现返回的是一大串 json 数据，把它复制出来做 json 解析。
解析时要把前面的 jQuery171030007002269320737_1611659180641( 以及末尾对应的 ) 删掉，只保留中间的 json 数据。
解析之后可以看到类似 http%3A%2F%2Fshp%2Eqpic%2Ecn%2Fishow%2F2735012617%2F1611652313%5F84828260%5F14368%5FsProdImgNo%5F1%2Ejpg%2F200 这样经过 URL 编码的地址，一看就需要解码，这里用 urllib 库下的 parse（parse.unquote）。
在这里插入图片描述

这里是单线程

from urllib import parse
from urllib import request
import requests
import os
# Browser-like User-Agent header passed to requests.get when fetching the
# work-list pages.
headers={'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
def picture_url(i):
    """Return the eight decoded image URLs of one work-list entry.

    Args:
        i: Mapping holding the percent-encoded URLs under the keys
            ``sProdImgNo_1`` .. ``sProdImgNo_8``.

    Returns:
        A list of eight decoded URLs. A trailing ``/200`` segment is rewritten
        to ``/0``, which requests the large, high-resolution image.

    Raises:
        KeyError: If any of the eight ``sProdImgNo_N`` keys is missing.
    """
    urls = []
    for index in range(1, 9):
        decoded = parse.unquote(i[f'sProdImgNo_{index}'])
        # Only touch the trailing size segment: a blanket replace('200', '0')
        # would also corrupt ids or timestamps that happen to contain "200".
        if decoded.endswith('/200'):
            decoded = decoded[:-len('200')] + '0'
        urls.append(decoded)
    return urls

def main():
    """Download the wallpapers listed on result pages 0-24, one at a time.

    Each entry of a page's ``List`` array gets its own directory
    ``wang/<name>/`` containing ``1.jpg`` .. ``8.jpg``. An entry that cannot
    be processed is reported and skipped so the remaining entries still run.
    """
    for i in range(0, 25):
        # Only the ``page`` value changes between requests. The jsoncallback
        # parameter was removed from the query string so that the response
        # can be parsed directly with resp.json().
        now_url=f'https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page={i}&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1611638986357'
        # A timeout keeps a stalled connection from hanging the script forever.
        resp = requests.get(now_url, headers=headers, timeout=10)
        txt = resp.json()       # JSON text -> Python objects
        text = txt['List']      # the list of wallpaper entries on this page
        for j in text:
            # Reset per entry so a failure never reuses the previous name.
            name = '<unknown>'
            try:
                # '1:1' is stripped because ':' cannot be used in the
                # directory name; strip() removes surrounding whitespace.
                name = parse.unquote(j['sProdName']).replace('1:1', '').strip()
                urls = picture_url(j)
                dirpath = os.path.join('wang', name)
                # makedirs also creates the 'wang' root on the first run and
                # does not fail when the directory already exists.
                os.makedirs(dirpath, exist_ok=True)
                for index, url in enumerate(urls):
                    request.urlretrieve(url, os.path.join(dirpath, f'{index+1}.jpg'))
            except (KeyError, TypeError, ValueError, OSError) as err:
                # Best effort: report the problem instead of hiding it, then
                # carry on with the next entry.
                print(f'{name}下载失败: {err}')
                continue
            print(f'{name}已下载成功')
            
# Run the single-threaded downloader only when the file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()

下面是多线程版本：对于多线程我也只是个搬运工，在代码里加上了一些自己的理解。

from urllib import parse
from urllib import request
import requests
import os
import threading
import queue
# Browser-like User-Agent header passed to requests.get by the producer
# threads when fetching the work-list pages.
headers={'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}

class producer(threading.Thread):
    """Thread that turns work-list page URLs into individual download jobs.

    It takes page URLs from ``page_queue``, fetches each page, creates one
    directory per wallpaper entry and puts a ``{'url', 'dirpath'}`` dict for
    every image onto ``picture_queue``.
    """

    def __init__(self,page_queue,picture_queue,*args,**kwargs):
        """Store the two queues and forward everything else to Thread.

        Args:
            page_queue: Queue of page URLs still to be fetched.
            picture_queue: Queue that receives one job dict per image.
            *args, **kwargs: Passed unchanged to ``threading.Thread``
                (for example ``name=``).
        """
        super().__init__(*args,**kwargs)
        self.page_queue=page_queue
        self.picture_queue=picture_queue

    def run(self) -> None:
        """Process page URLs until ``page_queue`` is drained."""
        while True:
            # get_nowait() instead of ``empty()`` followed by ``get()``:
            # another producer may take the last URL between the two calls,
            # which would leave this thread blocked in get() forever.
            try:
                now_url=self.page_queue.get_nowait()
            except queue.Empty:
                return
            try:
                # The timeout keeps a stalled connection from hanging the thread.
                resp = requests.get(now_url, headers=headers, timeout=10)
                text = resp.json()['List']
            except (requests.RequestException, ValueError, KeyError) as err:
                # One bad page must not stop this thread from handling the rest.
                print(f'{self.name}请求页面失败: {err}')
                continue
            for j in text:
                try:
                    urls = picture_url(j)
                    # '1:1' is stripped because ':' cannot be used in the
                    # directory name.
                    name = parse.unquote(j['sProdName']).replace('1:1', '').strip()
                    dirpath = os.path.join('wang',name)
                    # exist_ok avoids the exists()/mkdir() race between
                    # producers; makedirs also creates the 'wang' root.
                    os.makedirs(dirpath, exist_ok=True)
                except (KeyError, TypeError, OSError) as err:
                    print(f'{self.name}跳过一项: {err}')
                    continue
                for index,url in enumerate(urls):
                    # Jobs are queued one image at a time, not as a batch.
                    self.picture_queue.put({'url':url,'dirpath':os.path.join(dirpath,f'{index+1}.jpg')})

class consumer(threading.Thread):
    """Thread that downloads the images described by queued job dicts."""

    def __init__(self,picture_queue,*args,timeout=10,**kwargs):
        """Store the job queue and forward everything else to Thread.

        Args:
            picture_queue: Queue of ``{'url': ..., 'dirpath': ...}`` dicts.
            *args, **kwargs: Passed unchanged to ``threading.Thread``.
            timeout: Seconds to wait for a new job before the thread stops.
                Keyword-only; defaults to the previously hard-coded 10.
        """
        super().__init__(*args, **kwargs)
        self.picture_queue=picture_queue
        self.timeout=timeout

    def run(self) -> None:
        """Download queued images until no job arrives within ``timeout``."""
        while True:
            try:
                obj = self.picture_queue.get(timeout=self.timeout)
            except queue.Empty:
                # No job arrived in time: treat the queue as finished. Only
                # this exception ends the loop, so other errors are not
                # mistaken for "queue drained".
                break
            url = obj.get('url')
            dirpath = obj.get('dirpath')
            try:
                request.urlretrieve(url, dirpath)
            except (OSError, ValueError, TypeError) as err:
                # A failed download is reported with its cause and skipped.
                print(f'{dirpath}下载失败: {err}')
            else:
                print(f'{dirpath}下载完成')

def picture_url(i):
    """Return the eight decoded image URLs of one work-list entry.

    Args:
        i: Mapping holding the percent-encoded URLs under the keys
            ``sProdImgNo_1`` .. ``sProdImgNo_8``.

    Returns:
        A list of eight decoded URLs. A trailing ``/200`` segment is rewritten
        to ``/0``, which requests the large, high-resolution image.

    Raises:
        KeyError: If any of the eight ``sProdImgNo_N`` keys is missing.
    """
    urls = []
    for index in range(1, 9):
        decoded = parse.unquote(i[f'sProdImgNo_{index}'])
        # Only touch the trailing size segment: a blanket replace('200', '0')
        # would also corrupt ids or timestamps that happen to contain "200".
        if decoded.endswith('/200'):
            decoded = decoded[:-len('200')] + '0'
        urls.append(decoded)
    return urls

def main():
    """Start the multi-threaded download of result pages 0-9.

    Ten page URLs are queued, then three producer threads and five consumer
    threads are started. The function returns as soon as the threads have
    been started; it does not join them.
    """
    # The producers create per-wallpaper folders below 'wang', so the root
    # must exist before any thread runs.
    os.makedirs('wang', exist_ok=True)

    page_queue=queue.Queue(10)
    picture_queue=queue.Queue(1000)
    for i in range(0,10):
        # Only the ``page`` value changes between the queued URLs.
        now_url=f'https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page={i}&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1611638986357'
        page_queue.put(now_url)

    for x in range(3):
        th=producer(page_queue,picture_queue,name=f'生产者{x}号')
        th.start()
    for x in range(5):
        th=consumer(picture_queue,name=f'消费者{x}号')
        th.start()

# Run the multi-threaded downloader only when the file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()