在阅读《爬虫-漫画喵的100行逆袭》的代码后,观察龙族三漫画图片地址的规律,简单写了一个专门下载龙族三漫画的小爬虫。
#coding=utf-8
import os
import urllib2
# Simple downloader
def download(url, save_path):
    """Fetch ``url`` and write the response body to ``save_path``.

    This is a best-effort helper: any exception raised while fetching or
    writing is printed and swallowed, and nothing is returned.

    Args:
        url: Address passed to ``urllib2.urlopen``.
        save_path: Path of the file to write, opened in binary mode.
    """
    try:
        # Fetch first so that a failed request does not create or truncate
        # the destination file.
        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        with open(save_path, 'wb') as fp:
            fp.write(data)
    except Exception as et:
        print(et)
# Directory under the current working directory where the comic is stored.
# Built with os.path.join so the separator matches the host platform instead
# of hard-coding a backslash (which is also an unrecognised "\l" escape).
save_folder = os.path.join(".", "longzu")
if not os.path.exists(save_folder):
    os.mkdir(save_folder)
# Image URL template: {0} is the chapter number zero-padded to two digits,
# {1} is the page index within that chapter.
url = 'http://mhpic.zymk.cn/comic/L%2F%E9%BE%99%E6%97%8F%E2%85%A2%2F{0}%E8%AF%9D%2F{1}.jpg-mht.middle'
# Download chapters 1 through 46, one directory per chapter.
for chapter in range(1, 47):
    # Create the chapter directory if it does not exist yet.
    folder = os.path.join(save_folder, u"第 %d 话" % chapter)
    if not os.path.exists(folder):
        os.mkdir(folder)
    index = 1
    while True:
        image_url = url.format('%.2d' % chapter, index)
        save_image_name = os.path.join(folder, "%.2d" % index + "." + "jpg")
        # Any HTTP error is treated as "no more pages in this chapter".
        try:
            response = urllib2.urlopen(image_url)
        except urllib2.HTTPError:
            break
        # Read the body once and reuse it for saving, rather than requesting
        # the same image a second time.
        try:
            image_data = response.read()
        finally:
            response.close()
        print("downloading:%s from url:%s" % (save_image_name, image_url))
        # Saving is best-effort: report a write failure and move on to the
        # next page.
        try:
            with open(save_image_name, 'wb') as fp:
                fp.write(image_data)
        except (IOError, OSError) as err:
            print(err)
        index += 1