import asyncio
import aiohttp
from lxml import etree
import queue
# Shared FIFO of page URLs still to crawl; the worker coroutines drain it.
# NOTE(review): this is the *threading* queue, not asyncio.Queue — OK here
# only because workers never block on it across an await; confirm if the
# access pattern changes.
urlQ = queue.Queue()
# Output file shared by all workers; one image title per line.
f = open("title22.txt", "w",encoding='utf-8')
# One semaphore shared by every fetch so that at most 5 requests are in
# flight at once.  (The original created a brand-new Semaphore(5) inside
# each call, so every coroutine waited on its own private semaphore and
# the concurrency limit had no effect.)
_fetch_semaphore = asyncio.Semaphore(5)

async def get_html(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    The request carries a browser User-Agent, the target page as Referer,
    a hard-coded session cookie and the site Host header.  At most five
    fetches run concurrently (see _fetch_semaphore above).
    """
    ck = """Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1507966069,1509850072,1509851337,1509851651; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1509851653"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
        "Referer": url,
        "Cookie": ck,
        "Host": "www.mzitu.com"
    }
    async with _fetch_semaphore:  # limit concurrency to 5 fetches
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers) as resp:
                return await resp.text(encoding="utf-8")
async def parse():
    """Worker coroutine: pull URLs from urlQ until it is empty, fetch each
    page and append every image title found to the shared output file.

    The while-loop is what lets a fixed pool of workers cover any number
    of pages: each worker keeps pulling until the queue is drained.
    """
    while True:
        # EAFP: get_nowait() never blocks the event loop and has no
        # check-then-act gap between "is it empty?" and "take one".
        try:
            url = urlQ.get_nowait()
        except queue.Empty:
            break  # queue drained — this worker is done
        html = await get_html(url)
        selector = etree.HTML(html)
        titles = selector.xpath("//div[@class='postlist']//li/a/img/@alt")
        for title in titles:
            f.write(title + '\n')
# NOTE: the original had a stray module-level `asyncio.ensure_future(parse())`
# here.  It ran at import time (before any event loop exists — an error or
# deprecation warning on modern Python) and merely scheduled a redundant
# extra worker next to the 50 created below, so it has been removed.
# Enqueue all 156 listing pages, then run a fixed pool of 50 workers.
urls = ["http://www.mzitu.com/page/{}/".format(i) for i in range(1, 157)]
for url in urls:
    urlQ.put(url)

loop = asyncio.get_event_loop()
# 50 workers share one queue; because each worker loops until the queue is
# empty, the pool size is independent of the page count (without the while
# loop in parse(), 50 tasks would only cover the first 50 pages).
tasks = [parse() for _ in range(50)]
try:
    # gather() accepts coroutine objects directly; passing bare coroutines
    # to asyncio.wait() was deprecated in Python 3.8 and removed in 3.12.
    loop.run_until_complete(asyncio.gather(*tasks))
finally:
    # Release resources even if a worker raises.
    loop.close()
    f.close()
# 附上几个协程爬虫相关的链接:
# 这个里面有对请求的封装(加 UA、cookie 等)的异步爬虫: async/await 与 aiohttp 的使用,以及例子
# Python 中异步协程的使用方法介绍——这篇文章很棒
# aiohttp 中文文档