1.安装scrapy
pip install scrapy
pip install Pillow
2.创建项目
scrapy startproject wangzhe
cd .\wangzhe\
scrapy genspider wz xxx
3.修改wz.py
import os
import re
import scrapy
import urllib.parse
from ..items import *
# 手动保存
class WzSpider(scrapy.Spider):
    """Spider that pulls wallpaper image URLs from the Honor of Kings gallery API.

    The start URL is a JSONP endpoint; each record carries several
    "sProdImgNo_<n>" keys whose values are percent-encoded image URLs.
    """

    name = "wz"
    # allowed_domains = ["xxx"]
    index = 0
    start_urls = [
        "https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode"
        "=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page=0&iOrder=0&iSortNumClose=1&jsoncallback"
        "=jQuery111307884248345384934_1688708592927&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733"
        "&iActId=2735&iModuleId=2735&_=1688708592929"]

    def parse(self, response):
        """Extract and URL-decode every image URL, yielding one ImgItem.

        The item's ``image_urls`` list is consumed by the images pipeline
        configured in settings.py.
        """
        item = ImgItem()
        # Raw string fixes the invalid "\d" escape in the original literal;
        # \d+ also matches multi-digit indices (the old [\d] matched exactly
        # one digit and would silently skip sProdImgNo_10 and up).
        encoded = re.findall(r'"sProdImgNo_\d+":"(.*?)"', response.text)
        # Values are percent-encoded in the JSONP payload; decode before use.
        item['image_urls'] = [urllib.parse.unquote(url) for url in encoded]
        yield item
4.修改items.py
import scrapy
class ImgItem(scrapy.Item):
    """Item carrying the image URLs handed to the images pipeline."""

    # "image_urls" is the field name scrapy's ImagesPipeline reads by
    # default, so no IMAGES_URLS_FIELD override is needed in settings.
    image_urls = scrapy.Field()
5.修改pipelines.py
import os
from .settings import IMAGES_STORE
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
class ImgPipeline(ImagesPipeline):
    """Images pipeline that renames each downloaded file to "<n>.png".

    NOTE(review): the counter restarts at 0 on every crawl, so re-running
    against a non-empty IMAGES_STORE can collide with files from a previous
    run (os.rename raises FileExistsError on Windows in that case).
    """

    # Running counter used to build the sequential target file names.
    image_num = 0

    def item_completed(self, results, item, info):
        """Rename successfully downloaded images and pass the item on.

        ``results`` is a list of ``(ok, info_dict)`` pairs produced by the
        base ImagesPipeline; only successful downloads carry a "path".
        """
        # Single pass over results replaces the original's separate
        # print loop + list comprehension + rename loop.
        for ok, result in results:
            if not ok:
                continue
            path = result["path"]
            print(path)
            os.rename(
                os.path.join(IMAGES_STORE, path),
                os.path.join(IMAGES_STORE, str(self.image_num) + '.png'),
            )
            self.image_num += 1
        # Bug fix: a pipeline stage must return the item, otherwise every
        # later stage (and Scrapy's item export) receives None.
        return item
6.修改settings.py
BOT_NAME = "wangzhe"
SPIDER_MODULES = ["wangzhe.spiders"]
NEWSPIDER_MODULE = "wangzhe.spiders"
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
"Safari/537.36",
}
ITEM_PIPELINES = {
    "wangzhe.pipelines.ImgPipeline": 300,
}
IMAGES_STORE = "./wang/"
LOG_FILE = 'wangzhe.log'
LOG_FORMAT = '%(asctime)s--%(name)s--%(levelname)s: %(message)s'
LOG_DATEFORMAT = "%Y"
7.开始爬取
scrapy crawl wz