import os
import requests
from bs4 import BeautifulSoup
import re
import urllib
import time
# Request headers: a plausible Referer and browser User-Agent so the comic
# site does not reject the crawler as an obvious bot.
header = {
    'Referer': 'http://www.kuaikanmanhua.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
}
# Local directory where downloaded comic pages are saved.
# NOTE(review): `dir` shadows the `dir()` builtin; the name is kept because
# later (unseen) code in this script may reference it.
dir = "E:\\download\\pengran\\"
# URL of the first chapter to crawl (the starting point of the crawl).
url = "https://www.kuaikanmanhua.com/web/comic/10950/"
# url = "https://www.kuaikanmanhua.com/web/comic/157885/"
# Site root, used to join the relative "next chapter" links found on pages.
half_url = "https://www.kuaikanmanhua.com"
# Global counter, presumably used to number downloaded images — TODO confirm
# against the code that performs the downloads.
n = 1
# One shared session so the headers (and any cookies) persist across requests.
s = requests.session()
s.headers = header
# Collect the comic-image links on one chapter page. Because the selector is
# restricted to the lazy-load class used for comic panels, only the comic
# images (no site chrome) are returned.
def get_imageurl(url):
    """Return the list of image URLs for the comic panels on *url*.

    The page lazy-loads its panels: each one is an element with class
    ``kklazy`` whose real image source is stored in the ``data-kksrc``
    attribute, so we collect exactly those attributes.
    """
    global s  # shared requests session configured at module level
    html = s.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    # Every .kklazy node is one comic panel; data-kksrc holds its image URL.
    return [img_link['data-kksrc'] for img_link in soup.select('.kklazy')]
# 获取下一话的一半网址,因为要和该网站的网址进行拼接才可以访问。基本上就是通过正则表达式找到下一话对应的链接defget_next(url):next=""
con = requests.get(url)
content = BeautifulSoup(con.content,"lxml")
li = content.find_all("ul", class_="clearfix")for i inrange(len(li)):if i ==1:
a =str(li[i