使用 Python Flask 展示爬虫抓取的图片
上次写了一个简单的全站图片爬虫,但效果不够直观,所以想把爬取结果直观地展示出来。刚好看到了 Flask,便学习一下,做个图片展示的 demo。
Flask是一个基于Python语言的微型web框架,核心非常的小,简约而不简单,具有很强的扩展能力。
另外前端不是很熟悉,便引用了一下bootstrap4,使用现成的组件。
需要用flask的知识点:
所有的 Flask 程序都必须创建一个程序实例。Web 服务器使用一种名为 Web 服务器网关接口(WSGI)的协议,把接收自客户端的所有请求都转交给这个对象处理。程序实例是 Flask 类的对象,由下面的代码创建:
from flask import Flask
app = Flask(__name__)
@app.route('/') 指定 URL 与 Python 函数的映射关系,也简称路由
使用 render_template() 函数来实现模板的渲染
处理函数:
@app.route("/", methods=("GET", "POST"))
def index():
    """Render the crawler form and, after a POST, the images that were found.

    GET renders ``index.html`` with the default input hints.  POST reads the
    ``url`` and ``host`` form fields, runs ``jiandangzhuqu.get_href`` on them
    and renders the same template with the crawl results.

    The template always receives ``placeholder1`` and ``placeholder2``; after
    a crawl it also receives ``url``, ``host``, ``pagenum``, ``imgnum`` and
    ``imageset``.
    """
    # Pass an explicit context instead of **locals() so that only the
    # intended names are exposed to the template.
    context = {
        "placeholder1": '请输入需要爬取的网页地址',
        "placeholder2": '请输入筛选域名',
    }
    if request.method == "POST":
        user_info = request.values.to_dict()
        # A missing field becomes "" so the crawler never receives None.
        url = (user_info.get("url") or "").strip()
        host = (user_info.get("host") or "").strip()
        # Echo submitted values back as the hints; a blank field keeps its
        # default hint.
        if url:
            context["placeholder1"] = url
        if host:
            context["placeholder2"] = host
        # Without a start URL there is nothing to crawl: just show the form.
        if url:
            pagenum, imgnum, imageset = jiandangzhuqu.get_href(host, url)
            context.update(
                url=url,
                host=host,
                pagenum=pagenum,
                imgnum=imgnum,
                imageset=imageset,
            )
    return render_template('index.html', **context)
GET 请求直接返回 index 页面
POST 请求获取参数,执行爬虫,并展示拿到的结果
全部代码展示,有兴趣的朋友可以直接拷贝代码试一试:
学习案例,没有写分页,有兴趣的可以加上。
App.py
from flask import Flask
from server import jiandangzhuqu
from flask import render_template, request
# Application instance; the @app.route decorators below register their handlers on it
app = Flask(__name__)
@app.route("/", methods=("GET", "POST"))
def index():
    """Render the crawler form and, after a POST, the images that were found.

    GET renders ``index.html`` with the default input hints.  POST reads the
    ``url`` and ``host`` form fields, runs ``jiandangzhuqu.get_href`` on them
    and renders the same template with the crawl results.

    The template always receives ``placeholder1`` and ``placeholder2``; after
    a crawl it also receives ``url``, ``host``, ``pagenum``, ``imgnum`` and
    ``imageset``.
    """
    # Pass an explicit context instead of **locals() so that only the
    # intended names are exposed to the template.
    context = {
        "placeholder1": '请输入需要爬取的网页地址',
        "placeholder2": '请输入筛选域名',
    }
    if request.method == "POST":
        user_info = request.values.to_dict()
        # A missing field becomes "" so the crawler never receives None.
        url = (user_info.get("url") or "").strip()
        host = (user_info.get("host") or "").strip()
        # Echo submitted values back as the hints; a blank field keeps its
        # default hint.
        if url:
            context["placeholder1"] = url
        if host:
            context["placeholder2"] = host
        # Without a start URL there is nothing to crawl: just show the form.
        if url:
            pagenum, imgnum, imageset = jiandangzhuqu.get_href(host, url)
            context.update(
                url=url,
                host=host,
                pagenum=pagenum,
                imgnum=imgnum,
                imageset=imageset,
            )
    return render_template('index.html', **context)
@app.route("/idc/crawling/<host>/<path:url>")
def idcpachong(url, host):
    """Crawl ``url`` and return the collected image URLs as plain text.

    Both arguments come from the request path: ``host`` is the first segment
    after ``/idc/crawling/`` and ``url`` is everything after it (the ``path``
    converter lets it contain slashes).  They are forwarded to
    ``jiandangzhuqu.get_href``.

    Returns:
        ``str()`` of the image-URL collection returned by the crawler.
    """
    pagenum, imgnum, imageset = jiandangzhuqu.get_href(host, url)
    # Console summary of the crawl: pages fetched and images found
    print("抓取网页个数:%d" % (pagenum), "抓取图片数:%d" % (imgnum))
    print("hello world! %s : %s " % (url, host))
    return str(imageset)
#app.add_url_rule()
# @app.route('/execute',methods=("GET", "POST"))
# def execute(self):
# url = self
if __name__ == '__main__':
    # Local development entry point.  debug=True turns on Flask's debug mode,
    # so this must not be used to serve the app publicly.
    app.run(debug=True, port=9019)
#http://127.0.0.1:9019/idc/crawling/csdn/https://www.csdn.net
index.html
<!doctype html>
<html lang="en">
<head>
    <!-- Required meta tags -->
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <!-- Bootstrap CSS -->
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.5.0/dist/css/bootstrap.min.css"
          integrity="sha384-9aIt2nRpC12Uk9gS9baDl411NQApFmC26EwAOH8WgZl5MYYxFfc+NcPb1dKGj7Sk" crossorigin="anonymous">
    <title>Simpleness reptile</title>
    <style>
        .col-center-block {
            margin-top: 50px;
        }
        .form-row.align-items-center {
        }
        .form-control.mb-2 {
            width: 300px;
        }
        .content {
            /* The ";" after 100% was missing, which invalidated both declarations */
            width: 100%;
            height: auto;
        }
        .clearfix:after {
            display: block;
            visibility: hidden;
            clear: both;
            font-size: 0;
            height: 0;
            /* Escaped space character; the backslash had been lost */
            content: '\0020';
        }
        .list-group-item {
            overflow: hidden;
            height: 200px;
        }
        /* Disabled rule, kept for reference (HTML comments are not valid inside <style>)
        .imgtu {
            width: auto;
            height: auto;
        }
        */
        .list-group-item img {
            border: 0;
            margin: 0;
            padding: 0;
            /* Cap the thumbnail size. Replaces the IE-only expression() rules,
               which current browsers ignore. */
            max-width: 200px;
            max-height: 250px;
        }
    </style>
</head>
<body>
<form action="#" method="post">
    <div class="col-center-block form-row justify-content-center">
        <div class="col-auto align-items-center">
            <!-- Screen-reader label for the start-URL field -->
            <label class="sr-only" for="inlineFormInput">网页地址</label>
            <!-- The attribute value is quoted so hints containing spaces render whole
                 and user-submitted text cannot add attributes -->
            <input type="text" class="form-control mb-2" name="url" id="inlineFormInput" placeholder="{{ placeholder1 }}">
        </div>
        <div class="col-auto">
            <!-- Screen-reader label for the host-filter field -->
            <label class="sr-only" for="inlineFormInputGroup">筛选域名</label>
            <div class="input-group mb-2">
                <input type="text" class="form-control" name="host" id="inlineFormInputGroup" placeholder="{{ placeholder2 }}">
            </div>
        </div>
        <div class="col-auto">
            <button type="submit" class="btn btn-primary mb-2">爬取</button>
        </div>
    </div>
</form>
<div class="content">
    <!-- <ul class="imglist clearfix pageNum0" style="position: relative;">-->
    <ul class="list-group list-group-horizontal-xl justify-content-center" style="display:flex;width:100%;flex-wrap:wrap;">
        {# One tile per crawled image URL; the list is empty until a crawl has run #}
        {% for img in imageset %}
        <li class="list-group-item" style="width:250px;height:250px"><img class="imgtu" src="{{ img }}" alt=""></li>
        {% endfor %}
    </ul>
</div>
<!-- Optional JavaScript -->
<!-- jQuery first, then Popper.js, then Bootstrap JS -->
<script src="https://cdn.jsdelivr.net/npm/jquery@3.5.1/dist/jquery.slim.min.js"
        integrity="sha384-DfXdz2htPH0lsSSs5nCTpuj/zy4C+OGpamoFVy38MVBnE+IbbVYUew+OrCXaRkfj"
        crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/popper.js@1.16.0/dist/umd/popper.min.js"
        integrity="sha384-Q6E9RHvbIyZFJoft+2mJbHaEWldlvI9IOYy5n3zV9zzTtmI3UksdQRVvoxMfooAo"
        crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@4.5.0/dist/js/bootstrap.min.js"
        integrity="sha384-OgVRvuATP1z7JjHLkuOU7Xw704+h835Lr+6QL9UvYjZE3Ipu6Tp75j7Bh/kR0JKI"
        crossorigin="anonymous"></script>
</body>
</html>
jiandangzhuqu.py(位于 server 包下,文件名需与 App.py 中的导入保持一致)爬虫方法
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.request
import requests
import os,re
import urllib
def get_html(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout a single unresponsive host would stall the whole crawl.

    Returns:
        The body decoded as UTF-8, or ``None`` when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except (requests.RequestException, ValueError) as e:
        # Best effort: report the failure and let the caller skip this page.
        print('Error:', e)
        return None
    # Decode as UTF-8 regardless of the charset the server declares.
    response.encoding = 'utf-8'
    return response.text
def get_href(host, url):
    """Crawl ``url`` and the pages it links to, collecting image URLs.

    The start page is scanned for double-quoted ``href="..."`` values.  Every
    link that contains ``host`` (or every link, when ``host`` is empty) is
    fetched with ``get_html`` and scanned with ``get_img``.  The start URL is
    treated like any other link, so it is also subject to the ``host``
    filter.  Links are used exactly as written in the page; relative links
    are not resolved against ``url``.

    Args:
        host: Substring a link must contain to be followed.  ``""`` or
            ``None`` disables the filter.
        url: Address of the start page.

    Returns:
        A tuple ``(pagenum, imgnum, imageset)``: the number of pages fetched
        successfully, the number of distinct image URLs, and the set of
        those URLs.
    """
    # Capture only the quoted value, so no eval() is needed to unquote text
    # that comes from a remote page.
    href_re = re.compile(r'href=\"([^\s]*?)\"')
    html = get_html(url)
    # An unreachable start page yields no links instead of a TypeError.
    links = set(href_re.findall(html)) if html is not None else set()
    # The start page itself is scanned for images too.
    links.add(url)

    pagenum = 0  # pages fetched successfully
    imageset = set()  # distinct image URLs found so far
    for link in links:
        # Skip links that do not belong to the requested site.
        if host and host not in link:
            continue
        print(link)
        # Reuse the start page already downloaded instead of fetching it again.
        body = html if link == url else get_html(link)
        if body is not None:
            pagenum += 1
            get_img(body, imageset)
    imgnum = len(imageset)
    return pagenum, imgnum, imageset
#图片下载
def img_download(imglist, path='D:\\img'):
    """Download every URL in ``imglist`` into the directory ``path``.

    Files are named ``1.png``, ``2.png``, ... in download order; a URL that
    fails to download is reported and skipped without consuming a number.

    Args:
        imglist: Iterable of image URLs.
        path: Target directory, created when it does not exist yet.

    Returns:
        The number of images that were downloaded successfully.
    """
    os.makedirs(path, exist_ok=True)
    downloaded = 0
    for imgurl in imglist:
        target = os.path.join(path, '{0}.png'.format(downloaded + 1))
        try:
            urllib.request.urlretrieve(imgurl, target)
        except Exception as e:
            # Best effort: one broken URL must not abort the remaining downloads.
            print('Error:', e)
            continue
        downloaded += 1
        print('图片%d开始下载,注意查看文件夹' % (downloaded))
    return downloaded
def get_img(html, imageset):
    """Add every jpg/png/jpeg URL found in ``html`` to ``imageset``.

    Args:
        html: Page source to scan.  ``None`` or ``""`` is accepted and adds
            nothing.
        imageset: Set that receives the URLs; it is modified in place.

    Returns:
        The same ``imageset`` object, for convenience.
    """
    if not html:
        return imageset
    # Quotes and angle brackets end a URL.  Without that restriction,
    # neighbouring attributes in whitespace-free markup or JSON would be
    # merged into one bogus "URL".
    img_re = re.compile(r'http[^\s"\'<>]*?\.(?:jpg|png|jpeg)')
    # The set drops duplicates by itself.
    imageset.update(img_re.findall(html))
    return imageset
if __name__ == '__main__':
    # Manual smoke test: crawl the CSDN home page and the linked pages whose
    # URL contains "csdn", then print how many pages and images were found.
    pagenum, imgnum, imageset = get_href("csdn", "https://www.csdn.net/")
    print("抓取网页个数:%d" % (pagenum), "抓取图片数:%d" % (imgnum))
运行效果:
参考书籍:
Python Flask Web开发入门与项目实战
欢迎转载,请标注出处,如有错误之处,请指出。