# asyncio does not provide an async version of requests; it knows nothing about the HTTP protocol itself.
# There is no need to implement HTTP by hand -- use aiohttp directly.
# aiohttp also ships a high-concurrency web server.
# sanic claims performance on par with Go and likewise implements a high-concurrency web server.
# Here the aiohttp client side is used to build the crawler.
# asyncio crawler with URL de-duplication and database writes (async inserts via the aiomysql driver).
# Crawl target: www.jobbole.com
# Crawl strategy: collect every URL on a page and check whether it is an article detail page.
import aiohttp
import asyncio
import re
from pyquery import PyQuery
import aiomysql
# http://www.lfd.uci.edu/~gohlke/pythonlibs/  # useful when installing various Python libraries fails
start_url = "http://www.jobbole.com"
waiting_urls = []  # a plain list works; an asyncio.Queue would also do (see the sketch after consumer below)
seen_urls = set()  # URLs already crawled; with hundreds of millions of entries a set is no longer suitable
stopping = False   # flag that controls the consumer / event loop
# allow at most 3 concurrent requests
sem = asyncio.Semaphore(3)
# Fetch a page from the server and return its HTML
async def fetch(url, session):
    # Concurrency is high, so don't open a new connection for every request;
    # reuse a single ClientSession that is passed in as a parameter instead.
    # async with aiohttp.ClientSession() as session:
    async with sem:
        await asyncio.sleep(1)
        try:
            async with session.get(url) as resp:
                # check the status code before reading the body
                print("url status:{}".format(resp.status))
                if resp.status in [200, 201]:
                    data = await resp.text()
                    return data
        except Exception as e:
            print(e)
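# Standalone usage sketch for fetch (hypothetical, not part of the crawler flow):
#
#   async def demo():
#       async with aiohttp.ClientSession() as session:
#           html = await fetch("http://www.jobbole.com", session)
#           print(html[:100] if html else "no data")
#
#   asyncio.get_event_loop().run_until_complete(demo())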
# Implement the crawl strategy: parse a page and collect every crawlable URL on it
def extract_urls(html):
    urls = []
    if not html:
        return urls
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")
        if url and url.startswith("http") and url not in seen_urls:
            urls.append(url)
            waiting_urls.append(url)
    return urls
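# extract_urls above keeps only absolute links that already start with "http".
# An optional variant (hypothetical helper, not wired into the crawler) could also
# resolve relative hrefs against the page URL with urljoin:
from urllib.parse import urljoin

def extract_urls_absolute(html, base_url):
    urls = []
    if not html:
        return urls
    pq = PyQuery(html)
    for link in pq.items("a"):
        href = link.attr("href")
        if not href:
            continue
        url = urljoin(base_url, href)  # e.g. "/123/" -> "http://www.jobbole.com/123/"
        if url.startswith("http") and url not in seen_urls:
            urls.append(url)
    return urls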
# Fetch a listing page and collect the URLs found on it
async def init_urls(url, session):
    html = await fetch(url, session)
    seen_urls.add(url)
    # no need to use the return value: extract_urls already appends to waiting_urls
    extract_urls(html)
async def article_handler(url, session, pool):
    # fetch an article detail page, parse it, and insert the result into the database
    html = await fetch(url, session)
    if not html:
        return
    extract_urls(html)
    pq = PyQuery(html)
    title = pq("title").text()
    # pool.acquire() takes one connection out of the pool
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            await cur.execute("SELECT 42;")
            # the database and table can be created beforehand with a tool such as Navicat;
            # use a parameterized query so quotes in the title cannot break the SQL
            insert_sql = "insert into article_test(title) values(%s)"
            await cur.execute(insert_sql, (title,))
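# The insert above assumes an `article_test` table already exists (created e.g. with
# Navicat, as noted above). A minimal schema that would satisfy it might look like
# this (hypothetical sketch; adjust column sizes and charset as needed):
#
#   CREATE TABLE article_test (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       title VARCHAR(255)
#   ) DEFAULT CHARSET=utf8;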
# Consumer: keep taking URLs out of waiting_urls and hand each one off to a coroutine
async def consumer(pool, session):
    while not stopping:
        # when the list is empty, wait a moment, otherwise pop() would raise
        if len(waiting_urls) == 0:
            await asyncio.sleep(0.5)
            continue  # also avoids hammering the site with requests
        url = waiting_urls.pop()
        print("start get url:{}".format(url))
        # if the URL looks like an article detail page and has not been seen yet,
        # parse and store the article; otherwise treat it as a listing page
        if re.match(r'http://.*?jobbole.com/\d+/', url):
            if url not in seen_urls:
                seen_urls.add(url)
                asyncio.ensure_future(article_handler(url, session, pool))
                await asyncio.sleep(30)  # throttle: avoid sending too many requests at once
        else:
            if url not in seen_urls:
                asyncio.ensure_future(init_urls(url, session))
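# As noted next to waiting_urls, an asyncio.Queue could be used instead of a plain list.
# A minimal sketch of that variant (hypothetical, not wired into the rest of this script;
# only the article-detail path is shown): awaiting queue.get() replaces the sleep(0.5) polling.
url_queue = asyncio.Queue()

async def queue_consumer(pool, session):
    while not stopping:
        url = await url_queue.get()  # suspends until a URL becomes available
        if url in seen_urls:
            continue
        seen_urls.add(url)
        asyncio.ensure_future(article_handler(url, session, pool))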
async def main(loop):
    # wait for the MySQL pool to be created; charset is required to insert Chinese text
    # and autocommit must be enabled or the inserts will never be committed
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
                                      user='root', password='', db='aiomysql_test',
                                      loop=loop, charset="utf8", autocommit=True)
    # create the ClientSession once up front and reuse it everywhere; async with closes it
    # automatically, so the consumer must finish while the block is still open
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_urls.add(start_url)
        # no need to use the return value: extract_urls already appends to waiting_urls
        extract_urls(html)
        # asyncio.ensure_future(init_urls(start_url, session))
        await consumer(pool, session)
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    loop.run_forever()
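# `stopping` is defined at the top as the loop-control flag, but nothing here ever sets it
# to True, so the consumer runs until the process is killed. One possible way to wire it up
# (hypothetical sketch) is to catch Ctrl+C around run_forever:
#
#   try:
#       loop.run_forever()
#   except KeyboardInterrupt:
#       stopping = True
#       loop.stop()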