1.安装scrapy
pip install scrapy
pip install Pillow
2.创建项目
scrapy startproject wangzhe
cd .\wangzhe\
scrapy genspider wz xxx
3.修改wz.py
import os
import re
import scrapy
import urllib.parse
from ..items import *
# 手动保存
class WzSpider(scrapy.Spider):
    """Spider that pulls wallpaper image URLs from the Honor of Kings gallery API.

    The start URL is a JSONP endpoint; each record carries several
    "sProdImgNo_<n>" keys whose values are percent-encoded image URLs.
    """

    name = "wz"
    # allowed_domains = ["xxx"]
    index = 0
    start_urls = [
        "https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode"
        "=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page=0&iOrder=0&iSortNumClose=1&jsoncallback"
        "=jQuery111307884248345384934_1688708592927&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733"
        "&iActId=2735&iModuleId=2735&_=1688708592929"]

    def parse(self, response):
        """Extract and URL-decode every image URL, yielding one ImgItem.

        The item's ``image_urls`` list is consumed by the images pipeline
        configured in settings.py.
        """
        item = ImgItem()
        # Raw string fixes the invalid "\d" escape in the original literal;
        # \d+ also matches multi-digit indices (the old [\d] matched exactly
        # one digit and would silently skip sProdImgNo_10 and up).
        encoded = re.findall(r'"sProdImgNo_\d+":"(.*?)"', response.text)
        # Values are percent-encoded in the JSONP payload; decode before use.
        item['image_urls'] = [urllib.parse.unquote(url) for url in encoded]
        yield item
4.修改items.py
import scrapy
class ImgItem(scrapy.Item):
    """Item carrying the image URLs handed to the images pipeline."""

    # "image_urls" is the field name scrapy's ImagesPipeline reads by
    # default, so no IMAGES_URLS_FIELD override is needed in settings.
    image_urls = scrapy.Field()
5.修改pipelines.py
import os
from .settings import IMAGES_STORE
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
class ImgPipeline(ImagesPipeline):
    """Images pipeline that renames each downloaded file to "<n>.png".

    NOTE(review): the counter restarts at 0 on every crawl, so re-running
    against a non-empty IMAGES_STORE can collide with files from a previous
    run (os.rename raises FileExistsError on Windows in that case).
    """

    # Running counter used to build the sequential target file names.
    image_num = 0

    def item_completed(self, results, item, info):
        """Rename successfully downloaded images and pass the item on.

        ``results`` is a list of ``(ok, info_dict)`` pairs produced by the
        base ImagesPipeline; only successful downloads carry a "path".
        """
        # Single pass over results replaces the original's separate
        # print loop + list comprehension + rename loop.
        for ok, result in results:
            if not ok:
                continue
            path = result["path"]
            print(path)
            os.rename(
                os.path.join(IMAGES_STORE, path),
                os.path.join(IMAGES_STORE, str(self.image_num) + '.png'),
            )
            self.image_num += 1
        # Bug fix: a pipeline stage must return the item, otherwise every
        # later stage (and Scrapy's item export) receives None.
        return item
6.修改settings.py
BOT_NAME = "wangzhe"
SPIDER_MODULES = ["wangzhe.spiders"]
NEWSPIDER_MODULE = "wangzhe.spiders"
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
"Safari/537.36",
}
ITEM_PIPELINES = {
    "wangzhe.pipelines.ImgPipeline": 300,
}
IMAGES_STORE = "./wang/"
LOG_FILE = 'wangzhe.log'
LOG_FORMAT = '%(asctime)s--%(name)s--%(levelname)s: %(message)s'
LOG_DATEFORMAT = "%Y"
7.开始爬取
scrapy crawl wz