共实现两个网站,漫画栈处于完成品状态,最近没空做了,写个博客记录一下。
第一步,导入库:
import json
import os
from time import sleep
import requests
from lxml import etree
import re
from RandomUAMiddleware import RandomUAMiddleware
from imgWebInfo import lenON, imgWebInfo
同样的,定义一个获取网页的函数:
def getHtml(url):
    """Fetch *url* with a randomized User-Agent and return the Response.

    Returns:
        requests.Response on success; the empty string "" (falsy) on any
        failure, so callers can truth-test the result.
    """
    try:
        h = RandomUAMiddleware()
        header = {
            'User-Agent': h.Agent
        }
        # timeout prevents a stalled connection from hanging the crawler forever
        r = requests.get(url, headers=header, timeout=10)
        r.raise_for_status()
        # adopt the detected encoding so r.text decodes without mojibake
        r.encoding = r.apparent_encoding
        return r
    except Exception as e:
        # best-effort boundary: log the failure and hand back a falsy sentinel
        print(e)
        return ""
获取页面所有漫画图片链接:
def getImgUrl(html, listUrl, id_2):
    """Collect every comic-image URL on a chapter page into *listUrl*.

    Args:
        html: requests.Response for the chapter page (as returned by getHtml).
        listUrl: list the discovered image URLs are appended to (mutated in place).
        id_2: site id used to look up the per-site scraping config (imgWebInfo).

    Returns:
        The image path prefix (first x.imgPath match) for sites whose image
        list is embedded in an inline script (x.flag3 == 0); '' otherwise.
    """
    x = imgWebInfo(id_2)
    # decode the raw bytes with the site's declared charset before parsing
    xhtml = etree.HTML(html.content.decode(x.charset))

    if x.flag3 == 0:
        # Image URLs live in an inline <script>; split statements onto
        # separate lines so the per-site regexes match one per line.
        scriptInfo = str(xhtml.xpath(x.imgScript)).replace(';', '\n')
        for elm in re.findall(x.imgUrl, scriptInfo):
            listUrl.append(elm)
        path = re.findall(x.imgPath, scriptInfo)
        return path[0]

    # Otherwise the site serves the page list as JSON from a templated URL:
    # the chapter/page ids are the digits embedded in the page title.
    pageTitle = xhtml.xpath(x.pageTitle)
    nums = re.findall(r'\d+', pageTitle[0])
    imgJson = x.getJson.replace('[\']', nums[1])
    imgJson = imgJson.replace('[/]', nums[0])
    jsonHtml = getHtml(imgJson)
    # getHtml returns "" on failure; bail out instead of crashing on .text
    if not jsonHtml:
        print("获取图片失败")
        return ''
    print(jsonHtml.text)
    try:
        loadJson = json.loads(jsonHtml.text, strict=False)
        for u in loadJson['data']['page']:
            listUrl.append(u['image'])
    except (ValueError, KeyError, TypeError):
        # ValueError covers malformed JSON; KeyError/TypeError cover an
        # unexpected payload shape ('data'/'page'/'image' missing).
        print("获取图片失败")
    return ''
通过图片地址列表下载图片:
def downLoad(urlList, minPath, rootPath, id_1):
# print(urlList)
x = imgWebInfo(id_1)
if x.fl