Scraping wallpapers from the Honor of Kings (王者荣耀) official site with Scrapy

1. Install Scrapy

Scrapy drives the crawl itself; Pillow is required by Scrapy's built-in ImagesPipeline to process the downloaded images.

pip install scrapy
pip install Pillow 
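You can sanity-check the install before going further:

scrapy version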

2. Create the project

The domain argument to genspider ("xxx") is just a placeholder here, since allowed_domains is commented out in the spider.

scrapy startproject wangzhe
cd .\wangzhe\
scrapy genspider wz xxx
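After these two commands the project should have the standard Scrapy layout, roughly:

wangzhe/
├── scrapy.cfg
└── wangzhe/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── wz.py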

3. Modify wz.py

import re
import urllib.parse

import scrapy

from ..items import ImgItem


class WzSpider(scrapy.Spider):
    name = "wz"
    # allowed_domains = ["xxx"]  # placeholder domain from genspider, unused
    start_urls = [
        "https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode"
        "=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page=0&iOrder=0&iSortNumClose=1&jsoncallback"
        "=jQuery111307884248345384934_1688708592927&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733"
        "&iActId=2735&iModuleId=2735&_=1688708592929"]

    def parse(self, response):
        # The endpoint returns JSONP; the image URLs inside it are
        # percent-encoded, so decode each match before handing it off.
        item = ImgItem()
        encoded = re.findall(r'"sProdImgNo_\d":"(.*?)"', response.text)
        item["image_urls"] = [urllib.parse.unquote(u) for u in encoded]
        yield item
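The regex approach works, but since the response is just JSON in a JSONP wrapper, you can also strip the wrapper and parse it properly. A minimal sketch, assuming the payload keeps its usual shape (a "List" array whose entries hold percent-encoded sProdImgNo_* values; verify against the live response):

import json
import re
import urllib.parse


def extract_image_urls(text):
    # Strip the jQuery...(...) JSONP wrapper to get at the JSON body.
    body = re.search(r"\((\{.*\})\)", text, re.S).group(1)
    data = json.loads(body)
    urls = []
    for record in data.get("List", []):
        for key, value in record.items():
            if key.startswith("sProdImgNo_"):
                urls.append(urllib.parse.unquote(value))
    return urls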

4. Modify items.py


import scrapy


class ImgItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = scrapy.Field()
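By default, ImagesPipeline reads URLs from the image_urls field and, unless item_completed is overridden as in the next step, stores download results in an images field. If you ever switch back to the stock pipeline, add that field as well (a small sketch of the extended item):

import scrapy


class ImgItem(scrapy.Item):
    image_urls = scrapy.Field()  # URLs the pipeline should download
    images = scrapy.Field()      # results field used by the stock ImagesPipeline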

5. Modify pipelines.py

import os

from scrapy.pipelines.images import ImagesPipeline

from .settings import IMAGES_STORE


class ImgPipeline(ImagesPipeline):
    image_num = 0

    def item_completed(self, results, item, info):
        # results is a list of (ok, info_dict) tuples, one per download.
        image_paths = [x["path"] for ok, x in results if ok]
        for path in image_paths:
            # Rename full/<sha1>.jpg to a running number. Note this only
            # changes the extension: ImagesPipeline stores files as JPEG.
            os.rename(os.path.join(IMAGES_STORE, path),
                      os.path.join(IMAGES_STORE, f"{self.image_num}.png"))
            self.image_num += 1
        return item

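Renaming after download works, but the counter resets on every run and the renamed files fall out of Scrapy's dedup cache. An alternative sketch using the documented file_path() hook (recent Scrapy versions pass item to it), so files get their final names at download time. NamedImgPipeline is a hypothetical name, and the indexing assumes a single item per crawl, as in this spider:

from scrapy.pipelines.images import ImagesPipeline


class NamedImgPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # Number each file by its position in the item's image_urls list.
        # Assumes the URLs survive Scrapy's request normalization unchanged.
        index = item["image_urls"].index(request.url)
        return f"{index}.jpg"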
6. Modify settings.py

BOT_NAME = "wangzhe"

SPIDER_MODULES = ["wangzhe.spiders"]
NEWSPIDER_MODULE = "wangzhe.spiders"


ROBOTSTXT_OBEY = False


DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/114.0.0.0 Safari/537.36",
}

ITEM_PIPELINES = {
    "wangzhe.pipelines.ImgPipeline": 300,
}

IMAGES_STORE = "./wang/"


LOG_FILE = "wangzhe.log"
LOG_FORMAT = "%(asctime)s--%(name)s--%(levelname)s: %(message)s"
LOG_DATEFORMAT = "%Y-%m-%d %H:%M:%S"  # full timestamp; "%Y" alone would log only the year
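A few optional ImagesPipeline knobs can also go here (these are standard Scrapy settings; the values below are only examples):

IMAGES_EXPIRES = 90      # days before a previously downloaded image is considered stale
IMAGES_MIN_HEIGHT = 110  # skip images smaller than this
IMAGES_MIN_WIDTH = 110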

7. Run the crawl

scrapy crawl wz
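The wallpapers end up under IMAGES_STORE ("./wang/"), renamed by the pipeline. If you would rather launch from a script than from the CLI, CrawlerProcess works too (a minimal sketch; run.py is a hypothetical filename, placed next to scrapy.cfg so the project settings are found):

# run.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from wangzhe.spiders.wz import WzSpider

if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    process.crawl(WzSpider)
    process.start()  # blocks until the crawl finishes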

             
