代码:
# -*- coding:utf-8-*-
# asyncio crawler: URL de-duplication and database storage
import asyncio
import re
import aiohttp
from pyquery import PyQuery
import aiomysql
from urllib.parse import urljoin, urlparse
import re
import requests
# Event loop obtained once at import time.
loop = asyncio.get_event_loop()
# Entry page of the crawl; also used as the base when resolving relative links.
start_url = "https://news.cnblogs.com/"
# URLs that were discovered but not yet handed to a handler (consumed via pop()).
waitting_urls = []
# URLs that have already been processed; consulted for de-duplication.
seen_urls = set()
# Flag polled by consumer(); its loop ends once this becomes true.
stopping = False
sem = asyncio.Semaphore(1)  # limits concurrency: fetch() holds it for each request
async def fetch(url, session):
    """Download ``url`` through ``session`` and return the body as text.

    The request is issued with ``ssl=False`` while holding the module-level
    semaphore ``sem``. The response status is printed for every request.

    Returns:
        The response text when the status is 200 or 201; ``None`` for any
        other status or when an exception occurs (the exception is printed
        instead of being raised).
    """
    accepted_statuses = (200, 201)
    async with sem:
        try:
            async with session.get(url, ssl=False) as response:
                status = response.status
                print("url start:{}".format(status))
                if status not in accepted_statuses:
                    return None
                return await response.text()
        except Exception as err:
            print(err)
    return None
# async def init_url(session):
# html = await fetch(start_url, session)
# extract_url(html)
async def new_handle(url, session):
    """Fetch one news page, print its title and record the URL as seen.

    Args:
        url: URL of the news page to download.
        session: Client session passed through to ``fetch``.

    The URL is added to ``seen_urls`` before the download starts, so a
    second handler scheduled for the same URL returns without fetching it.
    A URL whose download fails therefore stays marked as seen.
    """
    # Claim the URL before the first await: handlers are scheduled in bulk,
    # so marking it only after the download would let duplicates through.
    if url in seen_urls:
        return
    seen_urls.add(url)

    html = await fetch(url, session)
    if not html:
        # fetch() returns None on errors and unexpected statuses; there is
        # nothing to parse in that case.
        return

    pq = PyQuery(html)
    title = pq("title").text().replace("新闻_博客园", "")
    print(title)
def extract_url(html):
    """Queue every new news-article link found in ``html``.

    Each ``<a href>`` is resolved against ``start_url``. Links that match
    ``https://news.cnblogs.com/n/<digits>/`` and are neither in
    ``seen_urls`` nor already waiting are appended to ``waitting_urls``.

    Args:
        html: Page markup as text. ``None`` or an empty string is ignored.

    Returns:
        None. The result is the mutation of ``waitting_urls``.
    """
    if not html:
        # Nothing to parse, e.g. the download failed upstream.
        return

    # Raw string so that ``\d`` reaches the regex engine unchanged.
    article_pattern = r"https://news.cnblogs.com/n/\d+/"
    # Track queued URLs so a link repeated on the page is queued only once.
    queued = set(waitting_urls)

    pq = PyQuery(html)
    for link in pq.items('a'):
        href = link.attr("href")
        if not href:
            continue
        url = urljoin(start_url, href)
        if url in seen_urls or url in queued:
            continue
        if url.startswith("https:") and re.match(article_pattern, url):
            queued.add(url)
            waitting_urls.append(url)

    print(waitting_urls)
    print("seen_urls", seen_urls)
    print("********")
async def consumer():
    """Drain ``waitting_urls`` and schedule a handler for each article URL.

    Runs until the module-level ``stopping`` flag becomes true. While the
    queue is empty it sleeps briefly and polls again. Every popped URL that
    matches the article pattern and is not in ``seen_urls`` is passed to
    ``new_handle`` as a background task sharing one client session.

    Before returning, it waits for the handlers that are still running so
    the shared session is not closed underneath them.
    """
    # Raw string so that ``\d`` reaches the regex engine unchanged.
    article_pattern = r"https://news.cnblogs.com/n/\d+/"
    # Strong references to in-flight handlers; each one removes itself
    # from the set when it finishes.
    running = set()

    async with aiohttp.ClientSession() as session:
        while not stopping:
            if not waitting_urls:
                # The sleep must be awaited; calling asyncio.sleep() without
                # await only creates a coroutine object and never pauses.
                await asyncio.sleep(0.2)
                continue

            url = waitting_urls.pop()
            print("start get url:{}".format(url))
            if re.match(article_pattern, url) and url not in seen_urls:
                task = asyncio.ensure_future(new_handle(url, session))
                running.add(task)
                task.add_done_callback(running.discard)
            print("************************")

        if running:
            # Let outstanding handlers finish while the session is still open.
            await asyncio.gather(*running, return_exceptions=True)
async def main(loop):
    """Seed the URL queue from the start page and launch the consumer.

    Args:
        loop: Event loop; only referenced by the disabled MySQL pool setup
            kept in the comment below.

    If the start page cannot be downloaded, a message is printed and the
    consumer is not started.
    """
    # Wait for the MySQL connection to be established (currently disabled):
    # pool = await aiomysql.create_pool(
    #     host='127.0.0.1', port=3306,
    #     user='root', password='', db='mysql',
    #     loop=loop, charset="utf8", autocommit=True
    # )
    async with aiohttp.ClientSession() as session1:
        html = await fetch(start_url, session1)
    seen_urls.add(start_url)

    if not html:
        # fetch() returns None on errors and unexpected statuses; without the
        # start page there are no links to crawl.
        print("start page could not be fetched: {}".format(start_url))
        return

    extract_url(html)
    asyncio.ensure_future(consumer())
if __name__ == '__main__':
    # Schedule the crawl on the current event loop, then keep the loop
    # running so the background tasks can execute.
    loop = asyncio.get_event_loop()
    loop.create_task(main(loop))
    loop.run_forever()