# 只供参考 (for reference only)
# Standard library
import os
import time

# Third party
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Shared user-agent generator, used by get_url.headers()
ua = UserAgent()
class get_url():
    """Scrape a manga gallery page and download every page image to disk.

    Typical use::

        g = get_url()
        g.page_url = 'https://...'   # gallery URL
        g.go()

    The gallery title (from the page's <h1>) names the download folder
    under ``path_dir``; images are saved as 1.jpg, 2.jpg, ...
    """

    page_url = ''           # gallery URL to scrape; set by the caller before go()
    path_dir = 'g:\\st\\'   # root download directory (fixed invalid '\s' escape)
    png_list = []           # class-level default; instances get their own (see __init__)
    manga_title = ''        # gallery title parsed by rs_get()

    def __init__(self):
        # Give each instance its own list so concurrent/successive instances
        # don't share the mutable class-level default.
        self.png_list = []

    def mkdir(self):  # create the download folder
        """Create (if missing) the per-title download folder; return its path."""
        p = self.path_dir + self.manga_title
        if not os.path.exists(p):
            print("文件夹: " + p)
            os.makedirs(p)
            print('新建文件夹: ' + p)
        else:
            print('文件夹已存在')
        return str(p)  # hand the folder path back to the caller

    def headers(self):
        """Return request headers with a random Chrome user agent.

        Bug fix: the HTTP header key is 'User-Agent' (hyphen); the original
        'User_Agent' key was silently ignored by servers, defeating the
        user-agent spoofing entirely.
        """
        return {'User-Agent': ua.chrome}

    def rs_get(self):
        """Fetch ``page_url``, parse the title, and queue per-page links.

        Fills ``self.png_list`` with ``[page_href, alt_text]`` pairs for
        ``save_png`` to download.
        """
        rs = requests.get(self.page_url, headers=self.headers())
        bs = BeautifulSoup(rs.text, 'html.parser')
        # author = bs.find_all('div', class_="tag-container field-name")[1].text
        self.manga_title = bs.find('div', id="info").find('h1').text
        print('名字:' + self.manga_title)
        data_list = bs.find_all('div', class_="thumb-container")  # one per page thumbnail
        for thumb in data_list:
            href = 'https:' + thumb.find('a')['href']  # per-image page URL
            page = thumb.find('img')['alt']
            # Queue the pair for the download pass.
            self.png_list.append([href, page])

    def save_png(self):
        """Download each queued image into the title folder, skipping existing files."""
        self.rs_get()            # scrape the page links first
        dir_path = self.mkdir()  # folder to save into
        for n, data in enumerate(self.png_list, start=1):
            print(data)
            png_path = dir_path + '\\' + str(n) + '.jpg'
            if os.path.exists(png_path):
                print("文件存在:" + png_path)
                continue  # already downloaded — skip
            try:
                # Visit the per-image page; bail out to the next one on timeout.
                rs = requests.get(data[0], headers=self.headers(), timeout=(5, 10))
                bs = BeautifulSoup(rs.text, 'html.parser')
                png_url = bs.find('section', id="image-container").find('a').find('img')['src']
                print(png_url)
                # Bug fix: the image download also gets a timeout so a stalled
                # transfer can't hang the whole run forever.
                png_data = requests.get(png_url, headers=self.headers(), timeout=(5, 10))
                with open(png_path, 'wb') as f:
                    f.write(png_data.content)
                print('保存文件: ' + png_path)
                print('________________________________________________')
                time.sleep(0.5)  # be polite to the server between downloads
            except requests.exceptions.RequestException as err:
                print(err)
                print("超时了跳过")
        self.png_list.clear()  # reset so a repeat run doesn't re-queue stale entries

    def go(self):
        """Entry point: scrape the gallery, download all images, report done."""
        self.save_png()
        print('完成爬取')
        self.png_list.clear()  # defensive reset (save_png already cleared it)