www.souutu.com是个开放的美图网站,里面的链接比较规律,这里利用bs4来获取图片链接并下载。
注意要先 pip install beautifulsoup4 requests lxml 安装好所需的第三方模块(代码中除了 bs4,还用到了 requests 和 lxml 解析器)
代码如下
# coding:utf-8
from time import sleep
import os,random,requests
from bs4 import BeautifulSoup
from tkinter import filedialog, messagebox, ttk
import tkinter as tk
class Spider(tk.Tk):
    """Tkinter GUI tool that scrapes images from souutu.com.

    Each gallery page on the site contains one image plus a link to the
    next page; the tool follows that chain, saving one image per page
    into a local ``bonita`` directory.
    """

    def __init__(self):
        super().__init__()
        self._set_window_()
        self._create_body_()

    def _set_window_(self):
        """Configure the main window: title, size, topmost, Esc minimizes."""
        self.title('爬图工具')
        self.geometry('550x450')
        self.wm_attributes('-topmost', True)  # keep the window on top
        self.bind("<Escape>", lambda event: self.iconify())

    def _create_body_(self):
        """Build the widgets: URL entry, count spinbox, start button, log box."""
        frame1 = tk.Frame(self, padx=15, pady=15)
        frame1.grid(row=0, column=0, sticky='w')
        frame2 = tk.Frame(self, padx=15, pady=15)
        frame2.grid(row=1, column=0, sticky='w')
        self.num_var = tk.IntVar()
        self.url_var = tk.StringVar()
        self.url_var.set('https://www.souutu.com/weimei/fengjing/19825.html')
        ttk.Label(frame1, text='请输入要爬取的URL:').grid(row=0, column=0, sticky='w', pady=3)
        ttk.Entry(frame1, textvariable=self.url_var, width=50).grid(row=0, column=1)
        ttk.Label(frame1, text='请选择要爬取的数量:').grid(row=1, column=0, sticky='w')
        ttk.Spinbox(frame1, from_=10, to=400, textvariable=self.num_var, width=20).grid(row=1, column=1, sticky='w', pady=5)
        ttk.Button(frame1, text='点击爬取', command=self.getImage).grid(row=2, column=0, sticky='w')
        ybar = ttk.Scrollbar(frame2, orient='vertical')  # vertical scrollbar for the log
        self.logText = tk.Text(frame2, width=66, height=15, yscrollcommand=ybar.set)  # log box
        self.logText.grid(row=0, column=0, columnspan=3)
        ybar['command'] = self.logText.yview
        ybar.grid(row=1, column=3, sticky='ns')

    def getImage(self):
        """Download ``num_var`` images, following the next-page chain from ``url_var``.

        Saves files into ``./bonita`` and opens that folder when finished.
        Runs in the Tk main thread, so the GUI is busy while downloading.
        """
        self.url = self.url_var.get()
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"}
        # Referer = two directory levels above the page URL; the site uses it
        # for hot-link protection.
        self.header["Referer"] = os.path.dirname(os.path.dirname(self.url))
        self.dirname = os.path.join(os.getcwd(), 'bonita')  # image output directory
        # makedirs(..., exist_ok=True) avoids the racy exists()+mkdir pair.
        os.makedirs(self.dirname, exist_ok=True)
        for i in range(self.num_var.get()):  # one image per iteration
            self.url = self.getimg(self.url, i)  # returns the next page URL
        messagebox.showinfo('提示', "全部图片下载完成,请查看")
        # os.startfile is the Windows-safe equivalent of the original
        # `os.system('start ' + path)` and handles paths with spaces.
        os.startfile(self.dirname)

    def getimg(self, url, i):
        """Download the image on page *url*, save it as the *i*-th file,
        and return the URL of the next page.

        Raises ValueError when the page lacks the expected image anchor
        (the original code hit an unbound-variable NameError here).
        """
        res = requests.get(url, headers=self.header, timeout=15)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'lxml')
        # The image anchor carries one of two titles depending on page type;
        # try both explicitly instead of a bare except.
        tag = (soup.find('a', attrs={'title': '点击翻页'})
               or soup.find('a', attrs={'title': '点击查看下一张'}))
        if tag is None or tag.img is None:
            raise ValueError('页面中未找到图片链接: ' + url)
        src = tag.img["src"]
        name = tag.img["alt"].split(',')[0]  # file name taken from the alt text
        extension = os.path.splitext(src)[1]
        imgname = os.path.join(self.dirname, "{}-{}".format(i, name) + extension)
        # Stream to disk in 8 KiB chunks (original used 10-byte chunks, which
        # is pathologically slow); send the same headers so the Referer-based
        # hot-link protection does not reject the image request.
        torrent = requests.get(src, headers=self.header, timeout=15)
        with open(imgname, 'wb') as f:
            for chunk in torrent.iter_content(8192):
                f.write(chunk)
        sleep(random.random() * 2)  # random polite pause between requests
        self.write_log(imgname + ' 图片下载完毕')
        nexturl = soup.find_all('a', attrs={'title': '下一张'})[0]['href']
        return nexturl

    def write_log(self, logmsg):
        """Append one line to the log box and keep the newest line visible."""
        self.logText.insert('end', logmsg + "\n")
        self.logText.see('end')  # auto-scroll so progress stays visible
if __name__ == "__main__":
    # Launch the scraper GUI and enter the Tk event loop.
    Spider().mainloop()
效果如下