# items.py — defines the CartoonItem fields
import scrapy
class CartoonItem(scrapy.Item):
    """Container for one comic chapter scraped by the spider."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    chapter = scrapy.Field()   # chapter title (from the list page's <a title>)
    url = scrapy.Field()       # chapter detail-page URL
    img_url = scrapy.Field()   # list of page-image URLs, in reading order
    img_path = scrapy.Field()  # local path of downloaded images (set by pipeline, if used)
# pipelines.py — the item pipeline: receives items from the spider
# and downloads the images with urllib
import os
from urllib.request import urlretrieve
class CartoonPipeline:
    """Item pipeline: stores each chapter's page URL and downloads its images.

    Layout on disk:
        ./妖神记/<chapter>/url.txt   — the chapter's detail-page URL
        ./妖神记/<chapter>/<i>.jpg   — the chapter's pages, in reading order
    """

    def open_spider(self, spider):
        """Create the root output directory once, when the spider starts."""
        # exist_ok replaces the original racy "check then mkdir" conditional
        # expression (`os.mkdir(...) if not os.path.exists(...) else 1`).
        os.makedirs('./妖神记', exist_ok=True)

    def process_item(self, item, spider):
        """Persist one chapter: write its source URL, then fetch every page image.

        Expects ``item`` to carry 'chapter', 'url' and 'img_url'
        (a list of image URLs in page order).  Returns the item unchanged
        so later pipeline stages still see it.
        """
        chapter_dir = f'./妖神记/{item["chapter"]}'
        os.makedirs(chapter_dir, exist_ok=True)
        # Record where this chapter came from.
        with open(f'{chapter_dir}/url.txt', 'w', encoding='utf-8') as f:
            f.write(f'url:\n{item["url"]}')
        # Download the pages; the enumerate index preserves reading order.
        for i, img_url in enumerate(item['img_url']):
            urlretrieve(url=img_url, filename=f'{chapter_dir}/{i}.jpg')
        print(item)
        return item
# spiders/ysj.py — the spider class
import scrapy
import re
from cartoon.items import CartoonItem
class YsjSpider(scrapy.Spider):
    """Spider for the comic "妖神记" on dmzj.com.

    ``parse`` walks the chapter list page and yields one request per
    chapter; ``parse_jpg`` extracts that chapter's image URLs from the
    detail-page source and yields the completed item.
    """
    name = 'ysj'
    allowed_domains = ['dmzj.com']
    start_urls = ['https://www.dmzj.com/info/yaoshenji.html']

    def parse(self, response):
        """Crawl the chapter list page and schedule one request per chapter."""
        # The //-node set needs surrounding parentheses before [2] to pick the
        # second list on the page ([1] is newest-first, [2] is oldest-first).
        tr_list = response.xpath("(//ul[@class='list_con_li autoHeight'])[2]//li")
        print(len(tr_list))
        for tr in tr_list:
            item = CartoonItem()
            item['chapter'] = tr.xpath("./a/@title").extract_first()  # chapter title
            item['url'] = tr.xpath("./a/@href").extract_first()       # detail-page URL
            yield scrapy.Request(
                url=item['url'],
                callback=self.parse_jpg,
                meta={'item': item}  # hand the partially-filled item to the callback
            )

    def parse_jpg(self, response):
        """Extract the chapter's page-image URLs via regex on the raw source."""
        item = response.meta['item']
        html_data = response.body.decode('utf-8')  # bytes -> str
        # Every page-image id is a 13/14-digit number embedded in the source;
        # this also matches ids belonging to other chapters.
        pics = re.findall(r'\d{13,14}', html_data)
        # Drop ids starting with '100' (not this chapter's pages).  Building a
        # new list avoids the original's remove-while-iterating footgun, which
        # its own comment warned "may not delete everything".
        pics = [pic for pic in pics if not pic.startswith('100')]
        pics = self.sort_pics(pics)
        # URL path components: .../chapterpic/<4-digit>/<5-digit>/<pic>.jpg;
        # both are delimited by '|' in the page source, [0] takes the first hit.
        chapterpic_hou = re.findall(r'\|(\d{5})\|', html_data)[0]
        chapterpic_qian = re.findall(r'\|(\d{4})\|', html_data)[0]
        item['img_url'] = [
            f'https://images.dmzj.com/img/chapterpic/{chapterpic_qian}/{chapterpic_hou}/{pic}.jpg'
            for pic in pics
        ]
        for img_url in item['img_url']:
            print(img_url)
        yield item

    def sort_pics(self, pics):
        """Return the page ids sorted numerically, 13-digit ids treated as
        right-padded to 14 digits.

        The original implementation padded the ids in place and then stripped
        any trailing '0' afterwards — which corrupted genuine 14-digit ids
        that happen to end in 0.  Sorting through a key function keeps every
        id byte-intact.
        """
        return sorted(pics, key=lambda pic: int(pic.ljust(14, '0')))