异步爬虫
- 基于线程池
- 基于单线程+多任务的异步爬虫
Flask的基本使用
-
环境安装:pip install flask
-
创建一个py源文件
"""Minimal Flask server exposing three slow pages for crawler timing tests."""
from time import sleep

from flask import Flask, render_template

# Instantiate the application.
app = Flask(__name__)


# View functions and their route addresses.
# render_template looks the file up in Flask's template folder, which is
# "templates" next to this module unless configured otherwise.
@app.route('/Hz')
def index_1():
    """Block for 2 seconds, then return the rendered test.html."""
    sleep(2)
    return render_template('test.html')


@app.route('/jay')
def index_2():
    """Block for 2 seconds, then return the rendered test.html."""
    sleep(2)
    return render_template('test.html')


@app.route('/tom')
def index_3():
    """Block for 2 seconds, then return the rendered test.html."""
    sleep(2)
    return render_template('test.html')


if __name__ == "__main__":
    # debug=True turns on debug mode: the server restarts automatically
    # whenever the server code is modified and saved.
    app.run(debug=True)
-
test.html(需放在与py源文件同级的templates目录下,render_template默认从该目录加载模板)
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" id="html">
<head><meta http-equiv="Cache-Control" content="no-siteapp" /><meta http-equiv="Cache-Control" content="no-transform " /><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>
我的收藏_古诗文网
</title>
<script src="/js/jquery-3.2.1.min.js" type="text/javascript"></script>
<style type="text/css">
.mainshoucang{width:705px; height:34px; line-height:34px; border-bottom:1px solid #d7d5bc;overflow:hidden; float:left; clear:left;}
.mainshoucang span{float:right; margin-top:15px; margin-left:5px; color:#65645F;}
.mainshoucang a{float:right; margin-top:15px; margin-left:5px;}
</style>
<script type="text/javascript">
// Redirect browsers whose user-agent string looks like a mobile device to the
// mobile version of the collection page; other browsers stay on this page.
if (/(phone|pad|pod|iPhone|iPod|ios|iPad|Android|Mobile|BlackBerry|IEMobile|MQQBrowser|JUC|Fennec|wOSBrowser|BrowserNG|WebOS|Symbian|Windows Phone)/i.test(navigator.userAgent)) {
    window.location.href = "https://m.gushiwen.cn/user/collect.aspx?type=s&p=1&id=2070621&sort=t";
}
</script>
<link href="/css/skinSo20210826.css" rel="stylesheet" type="text/css" />
<script src="/js/jquery-3.2.1.min.js" type="text/javascript"></script>
<script type="text/javascript">
// Return the unescaped value of the cookie called `name`,
// or null when document.cookie holds no such cookie.
function getCookie(name) {
    // Escape regex metacharacters so the name is matched literally
    // (otherwise a name such as "a.b" would also match a cookie "axb").
    var literalName = String(name).replace(/[.*+?^$()|[\]\\{}]/g, "\\$&");
    var pattern = new RegExp("(^| )" + literalName + "=([^;]*)(;|$)");
    var found = document.cookie.match(pattern);
    if (found) {
        // Group 2 is the raw cookie value up to the next ';'.
        return unescape(found[2]);
    }
    return null;
}
// Shared implementation of the four selectLike* functions.
// The `name` attribute of the element with id 'likeImg' + id is used as a
// call counter: it is incremented on every call, and the cookie is only
// consulted when the counter reads '1' after the increment.
// In that case, if `id` appears in the comma-separated list stored in the
// cookie `cookieName`, the image is switched to the "collected" icon.
function markCollectedFromCookie(id, cookieName) {
    var img = document.getElementById('likeImg' + id);
    img.name = parseInt(img.name) + 1;
    if (img.name != '1') {
        return;
    }
    var stored = getCookie(cookieName);
    if (stored == null || stored == '') {
        return;
    }
    var ids = stored.split(',');
    for (var i = 0; i < ids.length; i++) {
        // Loose equality is kept from the original so that a numeric id
        // still matches the string entries produced by split().
        if (ids[i] == id) {
            img.src = 'https://song.gushiwen.cn/siteimg/shou-cangok.png';
            img.alt = '已收藏';
            break;
        }
    }
}
// Check whether a poem is collected (cookie 'idsShiwen2017').
function selectLike(id) {
    markCollectedFromCookie(id, 'idsShiwen2017');
}
// Check whether a famous line is collected (cookie 'idsMingju2017').
function selectLikeMingju(id) {
    markCollectedFromCookie(id, 'idsMingju2017');
}
// Check whether an author is collected (cookie 'idsAuthor2017').
function selectLikeAuthor(id) {
    markCollectedFromCookie(id, 'idsAuthor2017');
}
// Check whether an ancient book is collected (cookie 'idsGuji2017').
function selectLikeGuwen(id) {
    markCollectedFromCookie(id, 'idsGuji2017');
}
</script>
<script>
// Keep an existing _hmt queue if one is already defined, otherwise start empty.
var _hmt = _hmt || [];
// Inject the hm.js script from hm.baidu.com in front of the first <script>
// element of the document.
(function () {
    var loader = document.createElement("script");
    loader.src = "//hm.baidu.com/hm.js?9007fab6814e892d3020a64454da5a55";
    var firstScript = document.getElementsByTagName("script")[0];
    firstScript.parentNode.insertBefore(loader, firstScript);
})();
</script>
</head>
<body onclick="closeshowBos()">
<div class="main1">
<div class="cont">
<div class="left">
<a href="https://www.gushiwen.cn/">古诗文网</a>
</div>
<div class="right">
<div class="son1">
<a style="margin-left:1px;" href="https://www.gushiwen.cn/">推荐</a>
<a href="/shiwens/">诗文</a>
<a href="/mingjus/">名句</a>
<a href="/authors/">作者</a>
<a href="/guwen/">古籍</a>
<a href="/user/collect.aspx" rel="nofollow" style="background-color:#757863;border-bottom:3px solid #F0EFE2;line-height:43px; height:43px;">我的</a>
<a style="width:65px;" href="/app/DefaultGwd.aspx" target="_blank">手机版</a>
</div>
<div class="son2">
<div class="search">
<form action="/search.aspx" onsubmit="return selectSearch()" contentType="text/html; charset=utf-8">
<input onkeydown="noajaxkeyUp()" onfocus="setInterval('showBos()',1000)" id="txtKey" name="value" type="text" value="" maxlength="40" autocomplete="off" style="height:25px; line-height:25px; float:left; padding-left:5px; width:260px; font-size:14px; clear:left; border:0px;" />
<input type="submit" style="float:right; width:24px; height:24px; clear:right; margin-top:2px; margin-right:3px; background-image:url(https://song.gushiwen.cn/siteimg/docSearch.png); background-repeat:no-repeat; background-size:24px 24px; border:0px;cursor:pointer;" value="" />
<input id="b" style="display:none;" type="text" />
</form>
</div>
<div id="box"></div>
</div>
</div>
</div>
</div>
<div style="width:1000px; clear:both; margin-left:auto; margin-right:auto; margin-top:20px; overflow:hidden;">
<div class="mainshoucang">
<span style=" height:34px; line-height:34px;font-size:18px; font-weight:bold; float:left; margin:0px; color:#0F0F0F;">我的收藏</span>
<span style=" height:34px; line-height:34px;font-size:18px; font-weight:bold; float:left; margin:0px; color:#0F0F0F; margin-left:10px; margin-right:10px;">/</span>
<a style=" height:34px; line-height:34px;font-size:18px; font-weight:bold; float:left; margin:0px;" href="/user/collectbei.aspx?sort=t">我的背诵</a>
<a href="/user/collect.aspx?type=s&id=2070621&sort=z">字母排序</a>
<span>/</span>
<span>时间排序</span>
</div>
<div style="width:265px;height:34px; line-height:34px; border-bottom:1px solid #C5C5C5;overflow:hidden; clear:right; float:right;font-size:14px;">
账号管理
</div>
</div>
<div id="mainSearch" class="mainSearch">
<div class="searchleft">
<a style="color:#FFFFFF;background-image:url(https://song.gushiwen.cn/siteimg/seachimg.jpg); background-repeat:no-repeat;">诗文</a>
<a href="/user/collect.aspx?type=m&id=2070621&sort=t">名句</a>
<a href="/user/collect.aspx?type=a&id=2070621&sort=t">作者</a>
<a href="/user/collect.aspx?type=d&id=2070621&sort=t">古籍</a>
</div>
<div class="left">
<img src="shoucangdemo.jpg" style=" margin-top:20px;" alt="收藏教程" width="620" height="556" />
<form id="FromPage" method="get" action="/user/collect.aspx" onsubmit="return PageSubmit()">
<div class="pagesright">
<a class="amore" style=" color:#808080;background-color:#e7e6d8;width:380px;">下一页</a>
<a style=" color:#808080;background-color:#e7e6d8;">上一页</a>
<span style=" background-color:#E1E0C7; border:0px; margin-top:22px; width:auto;">/ 1页</span>
<span class="curent"><input id="putpage" name="p" value="1" autocomplete="off" onblur="SubPage()" /></span>
<label id="temppage" style="display:none;">1</label>
<label id="sumPage" style="display:none;">1</label>
<input type="hidden" name="sort" value="t" />
<input type="hidden" name="id" value="2070621" />
</div>
</form>
</div>
<div class="right">
<div class="shisoncont">
<div class="line"><a href="/user/modifypwd.aspx?from=http://so.gushiwen.cn/user/collect.aspx">设新密码</a><span>未设置</span></div>
<div class="line"><a href="/user/bandemail.aspx?from=http://so.gushiwen.cn/user/collect.aspx">绑定邮箱</a><span>*****g@protonmail.com</span></div>
<div class="line"><a href="/user/bandphone.aspx?from=http://so.gushiwen.cn/user/collect.aspx">绑定手机号</a><span>未绑定</span></div>
<div class="line"><a id="bwxhao" style="cursor:pointer;">绑定公众号</a><span id="bwxbool"></span></div>
<div class="line"><a href="/user/loginlose.aspx?from=http://so.gushiwen.cn/user/collect.aspx">退出登录</a></div>
<div class="line" style=" border-bottom:0px;"><a href="/user/userDel.aspx">删除账号</a></div>
</div>
<div id="threeWeixin" style="display:none;">
<div class="hide-center" id="hide-center">
<div id="formhead">
<div id="formhead-title">
绑定公众号(可扫码登录)
</div>
<button type="button" id="close" style="color:#bcba9e;">X</button>
</div>
<div id="formbody">
<img id="erweimaCanshu" width="210" height="210" src="" alt="" />
</div>
</div>
</div>
<script type="text/javascript">
// Return the unescaped value of the cookie called `name`,
// or null when document.cookie holds no such cookie.
function getCookie(name) {
    // Escape regex metacharacters so the name is matched literally
    // (otherwise a name such as "a.b" would also match a cookie "axb").
    var literalName = String(name).replace(/[.*+?^$()|[\]\\{}]/g, "\\$&");
    var pattern = new RegExp("(^| )" + literalName + "=([^;]*)(;|$)");
    var found = document.cookie.match(pattern);
    if (found) {
        // Group 2 is the raw cookie value up to the next ';'.
        return unescape(found[2]);
    }
    return null;
}
// Close button of the QR-code dialog: fade the dialog out and stop the
// polling timer. Relies on intervalErweima (assigned in showErweima) being
// visible in this scope.
$("#close").on("click", function () {
    $("#threeWeixin").fadeOut("slow");
    clearInterval(intervalErweima);
});
// Number of polling rounds done by selectErweima since the dialog was shown.
var timesRun = 0;
// Random scene id sent along with the ticket and event-key requests.
var scene_id = Math.floor((Math.random() * 9999999) + 100000000);
// Show whether the account is bound to the official account, judged by the
// presence of the 'wxopenid' cookie (getCookie returns null when it is absent).
var wxopenid = getCookie('wxopenid');
// Fixed: the original wrote `wxopenid = null` (an assignment, always falsy),
// so the page always displayed '已绑定' regardless of the cookie.
if (wxopenid == null) {
    document.getElementById('bwxbool').innerHTML = '未绑定';
}
else {
    document.getElementById('bwxbool').innerHTML = '已绑定';
}
// Container of the QR-code dialog; its display style tells whether it is open.
var threeWeixinID = document.getElementById('threeWeixin');
// Set to 1 by selectErweima once the server stops answering "未扫码".
var erweimaShow = 0;
// "Bind official account" link: open the QR-code dialog after one second,
// or only reposition the dialog when it is already displayed.
$("#bwxhao").on("click", function () {
    var dialogHidden = threeWeixinID.style.display == 'none';
    if (dialogHidden) {
        setTimeout(showErweima, 1000);
        return;
    }
    // Dialog already visible: only update its position.
    document.getElementById('hide-center').style.top = $(window).scrollTop() * 2 + "px";
});
// Handle of the polling timer started by showErweima. Declared at script
// scope (the original declared it with `var` inside showErweima) so that the
// #close click handler and selectErweima can actually clear it instead of
// throwing a ReferenceError.
var intervalErweima;

// Create an XMLHttpRequest, falling back to ActiveX for IE5/IE6.
function createXmlHttp() {
    if (window.XMLHttpRequest) {
        return new XMLHttpRequest();
    }
    return new ActiveXObject("Microsoft.XMLHTTP");
}

// Fetch a QR-code ticket for scene_id, show the dialog and start polling
// every 2 seconds for the scan result.
function showErweima() {
    var xmlhttp = createXmlHttp();
    xmlhttp.onreadystatechange = function () {
        if (xmlhttp.readyState == 4 && xmlhttp.status == 200) {
            // The response body is used as the ticket of the QR-code image URL.
            document.getElementById('erweimaCanshu').src = "https://mp.weixin.qq.com/cgi-bin/showqrcode?ticket=" + xmlhttp.responseText;
        }
    };
    // NOTE(review): the third argument `false` makes this a synchronous
    // request, which blocks the page until the server answers.
    xmlhttp.open("GET", "/getTicket.aspx?scene_id=" + scene_id, false);
    xmlhttp.send();
    document.getElementById('hide-center').style.top = $(window).scrollTop() * 2 + "px";
    $("#threeWeixin").fadeIn("slow");
    timesRun = 0;
    // Stop a timer left over from an earlier call before starting a new one.
    clearInterval(intervalErweima);
    // Pass the function itself and a numeric delay rather than strings.
    intervalErweima = setInterval(selectErweima, 2000);
}

// One polling round: ask the server whether the QR code has been scanned.
function selectErweima() {
    // Give up after 60 rounds (the timer ticks every 2 seconds).
    timesRun = timesRun + 1;
    if (timesRun > 60) {
        $("#threeWeixin").fadeOut("slow");
        clearInterval(intervalErweima);
        // Do not send another request once polling has been stopped.
        return;
    }
    var xmlhttp = createXmlHttp();
    xmlhttp.onreadystatechange = function () {
        if (xmlhttp.readyState == 4 && xmlhttp.status == 200) {
            // Any answer other than "未扫码" ends the polling.
            if (xmlhttp.responseText != "未扫码") {
                $("#threeWeixin").fadeOut("slow");
                erweimaShow = 1;
                clearInterval(intervalErweima);
            }
        }
    };
    // NOTE(review): synchronous request, see showErweima.
    xmlhttp.open("POST", "/getEventKey.aspx?&scene_id=" + scene_id, false);
    xmlhttp.send();
}
</script>
<div class="juzioncont">
<img style=" float:left; margin:10px; margin-right:0px;" src="https://song.gushiwen.cn/siteimg/app/appdownGwd2021.png" width="80" height="80" /><p><center style="font-size:18px; margin-top:14px;">扫码下载</center></p><p><center style="font-size:18px; margin-top:5px;">古文岛客户端</center></p>
</div>
<div class="juzioncont">
<img style=" float:left; margin:10px; margin-right:0px;" src="https://song.gushiwen.cn/siteimg/app/erma_guwendao.png" width="80" height="80" /><p><center style="font-size:18px; margin-top:14px;">扫码关注</center></p><p><center style="font-size:18px;margin-top:5px;">古文岛公众号</center></p>
</div>
</div>
</div>
<div class="main4">
© 2021 <a href="https://www.gushiwen.cn/">古诗文网</a> | <a href="https://so.gushiwen.cn/shiwens/">诗文</a> | <a href="https://so.gushiwen.cn/mingjus/">名句</a> | <a href="https://so.gushiwen.cn/authors/">作者</a> | <a href="https://so.gushiwen.cn/guwen/">古籍</a> | <a href="/jiucuo.aspx?u=" target="_blank" rel="nofollow">纠错</a>
</div>
<script type="text/javascript">
// Once the page has loaded, hand the element with id 'external-frame' to
// setIframeHeight.
// NOTE(review): setIframeHeight is not defined in this page's inline scripts;
// presumably it comes from an external script -- confirm.
window.onload = function () {
    var frame = document.getElementById('external-frame');
    setIframeHeight(frame);
};
</script>
<script defer="defer" src="/js/skinso20210709.js" type="text/javascript"></script>
</body>
</html>
同步代码
import time
import requests
def get_request(url, timeout=10):
    """Send a GET request to *url* and return the length of the response text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without a timeout a stalled server would block
            the caller indefinitely.

    Returns:
        Number of characters in the decoded response body.
    """
    response = requests.get(url=url, timeout=timeout)
    return len(response.text)
# Synchronous version: the URLs are requested one after another.
if __name__ == '__main__':
    # Start time
    start = time.time()

    urls = [
        'http://127.0.0.1:5000/Hz',
        'http://127.0.0.1:5000/jay',
        'http://127.0.0.1:5000/tom',
    ]

    # map() is lazy, so each request is made and printed before the next starts.
    for page_length in map(get_request, urls):
        print(page_length)
    print('总耗时:', time.time() - start)
异步代码
import time
import requests
# 线程池
from multiprocessing.dummy import Pool
def get_request(url, timeout=10):
    """Send a GET request to *url* and return the length of the response text.

    Takes one positional argument and returns a value, so it can be passed
    directly to ``pool.map``.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without a timeout a stalled server would block
            its worker thread indefinitely.

    Returns:
        Number of characters in the decoded response body.
    """
    response = requests.get(url=url, timeout=timeout)
    return len(response.text)
# 同步代码
# if __name__ == '__main__':
# # 开始时间
# start = time.time()
#
# urls = [
# 'http://127.0.0.1:5000/Hz',
# 'http://127.0.0.1:5000/jay',
# 'http://127.0.0.1:5000/tom',
# ]
#
# for url in urls:
# res = get_request(url)
# print(res)
# print('总耗时:', time.time() - start)
# Asynchronous version: the URLs are requested concurrently by a thread pool.
if __name__ == '__main__':
    urls = [
        'http://127.0.0.1:5000/Hz',
        'http://127.0.0.1:5000/jay',
        'http://127.0.0.1:5000/tom',
    ]
    start = time.time()
    # Pool(3) starts 3 worker threads; the with-block shuts the pool down
    # when the work is finished instead of leaving the workers alive.
    with Pool(3) as pool:
        # get_request is called once per element of urls on the worker
        # threads, so it must accept exactly one argument and return a value.
        # map() blocks until every call has finished and returns the results
        # in the order of urls.
        result_list = pool.map(get_request, urls)
    print(result_list)
    print('总耗时:', time.time() - start)
线程池
# 线程池
from multiprocessing.dummy import Pool
# Create a pool with 3 worker threads.
pool = Pool(3)
# Apply callback to every element of alist asynchronously on the pool's
# threads; callback and alist are placeholders to be supplied by the reader.
result_list = pool.map(callback, alist)