BeautifulSoup https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#id10
步骤:
- 发送http请求,获取html
- 获取所有img标签
- 用正则表达式取出src的网址
- 请求每个图片网址,将返回的内容(图片的二进制数据)以二进制方式写入文件,保存为图片
代码:
import urllib.request
from bs4 import BeautifulSoup
import re
# Page that is fetched and scanned for <img> tags when this script runs.
picture_url = "http://www.umei.cc/"
def get_html(url):
    """Fetch ``url`` and return the raw response body.

    The response object is used as a context manager so the underlying
    connection is closed as soon as the body has been read, instead of
    being left open until garbage collection.

    Args:
        url: URL string (or ``urllib.request.Request``) handed to
            ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: if the request fails.
        ValueError: if ``url`` has no recognizable scheme.
    """
    with urllib.request.urlopen(url) as res:
        return res.read()
def parse_html(htmlfile, save_dir="G:/python/picture"):
    """Download every ``<img>`` referenced in an HTML document.

    Each image is saved as ``<save_dir>/<n>.jpg`` where ``n`` is the
    1-based position of the tag among all ``<img>`` tags in the document.
    An image is written to disk only after it has been downloaded
    successfully, so a failed request no longer leaves an empty ``.jpg``
    file behind.

    Args:
        htmlfile: HTML markup (``str`` or ``bytes``) to scan.
        save_dir: Existing directory that receives the image files.
            Defaults to the location the script has always used.

    Returns:
        None. Failed downloads are reported on stdout and skipped.
    """
    mysoup = BeautifulSoup(htmlfile, 'html.parser')
    # NOTE(review): the pattern runs on the serialized tag, so it also
    # matches any attribute whose name ends in ``src`` (e.g. ``data-src``).
    imgre = re.compile(r'src="(.*?)"')

    for index, tag in enumerate(mysoup.find_all('img'), start=1):
        matches = imgre.findall(str(tag))
        if not matches:
            continue
        img_url = matches[0]

        # Download first; open the output file only once the data is in hand.
        try:
            with urllib.request.urlopen(img_url) as res:
                data = res.read()
        except (urllib.error.URLError, ValueError) as e:
            # ValueError is raised for URLs without a scheme (relative paths).
            print("download failed for %s: %s: %s"
                  % (img_url, type(e).__name__, e))
            continue

        with open("%s/%s.jpg" % (save_dir, index), 'wb') as f:
            f.write(data)
# Script entry point: fetch the configured page, then download its images.
parse_html(get_html(picture_url))
效果:
用requests爬取,并用tkinter做一个简单界面
效果图:
输入网址,点击下载:
下载完成:
代码:
import requests
from bs4 import BeautifulSoup
import os
import tkinter as tk
from tkinter import scrolledtext
from tkinter.messagebox import *
def download():
    """Button callback: download the images of the page typed in ``src``.

    Reads the text widget, trims surrounding whitespace, and returns ``-1``
    without doing anything when the field is empty. Otherwise the page is
    fetched, the input field is cleared, and the page's images are saved
    via ``parser_data``.
    """
    address = src.get("0.0", "end").strip()
    if not address:
        return -1

    page = get_html(address)
    # Clear the input only after the page itself was fetched.
    src.delete("0.0", "end")
    parser_data(page)
def get_html(url, timeout=10):
    """Fetch ``url`` with a browser-like User-Agent and return the body.

    A scheme is added only when the URL does not already carry one:
    ``https://`` URLs are left untouched (previously they were turned into
    ``http://https://...``), and protocol-relative ``//host/path`` URLs get
    ``http:`` prefixed.

    Args:
        url: Address to fetch; may omit the ``http://`` prefix.
        timeout: Seconds to wait for the server before giving up, passed
            to ``requests.get``. Prevents a stalled server from blocking
            the caller forever.

    Returns:
        The raw response body as ``bytes`` (``Response.content``).

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    if url.startswith("//"):
        # Protocol-relative URL: only the scheme is missing.
        url = "http:" + url
    elif not url.lower().startswith(("http://", "https://")):
        url = "http://" + url
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"}
    res = requests.get(url, headers=head, timeout=timeout)
    return res.content
def parser_data(html):
    """Save every ``<img>`` of an HTML document into ``./download``.

    Files are named ``<n>.jpg``; numbering continues after the number of
    entries already present in the folder so earlier downloads are not
    overwritten. A message box is shown once all tags were processed.

    Each ``src`` value is passed to ``get_html`` unchanged; relative paths
    are not resolved against the page they came from.

    Args:
        html: HTML markup (``str`` or ``bytes``) to scan.

    Returns:
        None.
    """
    soup = BeautifulSoup(html, "html.parser")
    img = soup.find_all("img")  # every <img> tag in the document
    print(len(img))

    # exist_ok: the folder already exists on every run after the first;
    # a bare os.mkdir would raise FileExistsError there.
    os.makedirs("download", exist_ok=True)
    # Continue numbering after the files that are already in the folder.
    i = len(os.listdir("download"))

    for x in img:
        url = x.get("src")  # None when the tag has no src attribute
        if not url:
            continue
        print(url)
        try:
            image = get_html(url)
        except requests.RequestException as e:
            # One unreachable image must not abort the remaining downloads.
            print("download failed for %s: %s" % (url, e))
            continue
        i += 1
        with open("./download/%s.jpg" % i, "wb") as f:
            f.write(image)

    showinfo(title="提示", message="下载完成!")
# Build a fixed-size, single-row window: a download button on the left and
# a text field for the page address filling the rest of the row.
mainWindow = tk.Tk()
mainWindow.title("图片下载器")
# Identical min and max sizes pin the window at 650x30.
mainWindow.minsize(650, 30)
mainWindow.maxsize(650, 30)

font_text = "Calibri"
font_size = 15

# Address input; read and cleared by download().
src = tk.Text(mainWindow, font=(font_text, font_size))
src.place(x=50, y=0, width=600, height=30)

down = tk.Button(
    mainWindow,
    text="下载",
    font=(font_text, font_size),
    bg="#2ECCFA",
    fg="white",
    command=download,
)
down.place(x=0, y=0, width=50, height=30)

# Hand control to Tk; returns only when the window is closed.
mainWindow.mainloop()