下面是源码
from urllib import request
import requests, re
import time
import random
import socket
def mkdir(path):
    """Create the directory *path* (including parents) unless it already exists.

    Surrounding whitespace and any trailing backslashes are removed from
    *path* before it is used.

    Args:
        path: Directory path to create.

    Returns:
        True if the directory was created by this call, False if something
        already existed at that path.
    """
    import os

    # Normalise: drop surrounding whitespace, then any trailing "\" characters.
    path = path.strip().rstrip("\\")

    if os.path.exists(path):
        # Already present: report it and leave the filesystem untouched.
        print(path + ' 目录已存在')
        return False

    # exist_ok=True keeps this from crashing if the directory appears
    # between the existence check above and this call.
    os.makedirs(path, exist_ok=True)
    print(path + ' 创建成功')
    return True
# Crawl script: fetch the chapter index at bese_url, then walk every chapter
# page by page, extract the image path from each page and save the image
# under ./亚人/<chapter>.
#
# NOTE(review): the pasted source had lost all indentation, so the nesting
# below is reconstructed from the statements and the original comments.
# In particular the 60-second pause is assumed to run once per chapter
# (after the page loop) -- confirm against the author's original file.
bese_url = "https://manhua.fzdm.com/41/"
head = {
    "cookie": "picHost=p17.xiaoshidi.net; Hm_lvt_cb51090e9c10cda176f81a7fa92c3dfc=1545054252,1545054291,1545054381,1545054404; Hm_lpvt_cb51090e9c10cda176f81a7fa92c3dfc=1545054475",
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}

# Download the index page and collect every href found in it.
r = requests.get(bese_url, headers=head)
juan = r.text
juanm = re.findall('<a href="(.+?)"', juan)

# Per the original author: drop the leading/trailing links that are not
# chapters. The 13 / -3 offsets are hard-coded for this page's layout.
juanm1 = juanm[13:-3]
# The links are listed in reverse order on the page, so flip them.
juanm1.reverse()
print(juanm1)

# Template for a chapter's base address (per the author: the "hidden" image pages).
bese_ur2 = "https://manhua.fzdm.com/41/{}"
# Page file name inside a chapter.
a = "index_{}.html"
# Host prefix that the extracted image path is appended to.
tupiandizhi = "http://p0.xiaoshidi.net/{}"
# Candidate pause lengths in seconds between page requests.
shuzi = [1, 2, 3, 4, 5]
# Compiled once: pulls the image path out of each page's inline script.
mhurl_pattern = re.compile('var mhurl="(.*?)"')

for i in juanm1:
    print(i)
    # Target folder for this chapter; created if missing.
    zhangshu1 = "./亚人/{}".format(i)
    mkdir(zhangshu1)
    bese_url3 = bese_ur2.format(i)

    # Walk the chapter's pages; 1000 is only an upper bound, the loop ends
    # as soon as a page yields no jpg/png image.
    for page in range(0, 1000):
        bese_url4 = bese_url3 + a.format(page)
        nr = requests.get(bese_url4, headers=head, timeout=2)

        tupian = mhurl_pattern.findall(nr.text)
        print(tupian)
        tupian1 = "".join(tupian)
        print(tupian1)
        tupiandizhi1 = tupiandizhi.format(tupian1)
        print(tupiandizhi1)

        # Candidate file names: page-numbered for jpg, bare tail for png.
        # NOTE(review): the png name carries no page number, unlike the jpg
        # name -- kept as in the original; confirm this is intended.
        yeshu1 = '第{a}页{b}'.format(a=page, b=tupian1[-8:])
        yeshu2 = '{b}'.format(b=tupian1[-6:])
        print(yeshu1[-3:])

        houzhui = tupian1[-3:]
        if houzhui == 'jpg':
            wenjianming = yeshu1
        elif houzhui == 'png':
            wenjianming = yeshu2
        else:
            # Neither jpg nor png: treat it as the end of this chapter.
            print("最后一页")
            break

        try:
            request.urlretrieve(tupiandizhi1, zhangshu1 + wenjianming)
        except Exception:
            # Best effort: report the failed image and carry on. Catching
            # Exception (not a bare except) lets Ctrl-C still stop the crawl.
            print(tupian1 + "没获取到")

        # Random pause so the request pattern looks less automated.
        shuzishij = random.randint(0, 4)
        print(shuzishij)
        time.sleep(shuzi[shuzishij])

    time.sleep(60)
代码写得比较乱,看不懂的可以留言问哈。新手请多多指教。谢谢