Python Web Scraping: Improving Crawler Efficiency with Asynchronous Techniques
Python multithreading syntax
First approach
from threading import Thread

def func():
    print("worker thread running")

t = Thread(target=func)
t.start()  # the thread becomes runnable; when it actually executes is decided by the scheduler
Second approach
class MyThread(Thread):
    def run(self):
        print("worker thread running")  # the thread's task goes here

t = MyThread()
t.start()
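In both forms, start() only makes the thread runnable; if the main program needs to wait for the work to finish, call join() on the thread object:

t = MyThread()
t.start()
t.join()  # block the main thread until this thread's run() has finished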
Python multiprocessing syntax (not commonly used for crawlers)
from multiprocessing import Process

p = Process(target=func)
p.start()
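One caveat: on Windows (and with the default spawn start method on macOS), process creation must sit under an if __name__ == '__main__': guard, otherwise the child re-imports the module and spawns recursively:

from multiprocessing import Process

def func():
    print("worker process running")

if __name__ == '__main__':
    p = Process(target=func)
    p.start()
    p.join()  # wait for the child process to exit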
Thread pools and process pools
Thread pool: a batch of threads is created up front. The user submits tasks directly to the pool, and the pool takes care of scheduling those tasks onto its threads.
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

with ThreadPoolExecutor(50) as t:      # a pool of 50 threads
    for i in range(100):
        t.submit(fn, name=f"task{i}")  # extra keyword arguments are passed through to fn
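submit() returns a concurrent.futures.Future, so results can also be collected as tasks finish. A minimal sketch, assuming fn returns a value:

from concurrent.futures import ThreadPoolExecutor, as_completed

def fn(name):
    return f"{name} done"

with ThreadPoolExecutor(50) as t:
    futures = [t.submit(fn, name=f"task{i}") for i in range(100)]
    for fut in as_completed(futures):  # yields futures in completion order
        print(fut.result())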
Example: using a thread pool to scrape book data from ryjiaoyu.com (Posts & Telecom Press education)
Approach:
1. Extract the data from a single page.
2. Use a thread pool to fetch multiple pages concurrently.
Code:
import requests
from lxml import etree
import csv
import time
from threading import Lock
from concurrent.futures import ThreadPoolExecutor

f = open("book2.csv", mode="w", newline='', encoding="utf-8")
csvwriter = csv.writer(f)
lock = Lock()  # csv.writer is not thread-safe, so writes from worker threads are guarded with a lock

def download_one_page(url):
    resp = requests.get(url)
    # print(resp.text)
    html = etree.HTML(resp.text)
    divs = html.xpath("/html/body/div[3]/div/div/div/div/ul/li/div[2]")
    for div in divs:
        name = div.xpath("./h4/a/text()")[0]
        # print(name)
        author = div.xpath("./div/span/text()")[0].strip()
        # print(author)
        price = div.xpath("./span/span/text()")[0].strip("¥")
        # print(price)
        with lock:
            csvwriter.writerow([name, author, price])
    time.sleep(1)  # be polite to the server
    resp.close()

if __name__ == '__main__':
    with ThreadPoolExecutor(50) as t:
        for i in range(0, 38):
            t.submit(download_one_page, f"https://www.ryjiaoyu.com/tag/details/7/?page={i}")
    print("over!")
    f.close()
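ProcessPoolExecutor, imported alongside ThreadPoolExecutor above, exposes the same interface; it suits CPU-bound work rather than network I/O. A minimal sketch with a hypothetical square function:

from concurrent.futures import ProcessPoolExecutor

def square(x):
    return x * x

if __name__ == '__main__':
    with ProcessPoolExecutor(4) as p:
        print(list(p.map(square, range(10))))  # map distributes the work across processes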
Coroutines
Coroutine: when the program hits an I/O operation, it can voluntarily switch to another task instead of blocking.
Writing coroutines in Python
import asyncio

async def func1():
    print("func1")
    await asyncio.sleep(1)  # example workload: a non-blocking wait

async def func2():
    print("func2")
    await asyncio.sleep(1)

async def func3():
    print("func3")
    await asyncio.sleep(1)

async def main():
    # g = func1()  # calling an async function does not run it; it returns a coroutine object
    tasks = [
        asyncio.create_task(func1()),
        asyncio.create_task(func2()),
        asyncio.create_task(func3()),
    ]
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
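asyncio.gather is the more common alternative to asyncio.wait: it accepts coroutines directly, runs them concurrently, and returns their results in order:

async def main():
    results = await asyncio.gather(func1(), func2(), func3())  # concurrent, results in call order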
Asynchronous sleep
await asyncio.sleep(3)  # non-blocking: yields control to the event loop while waiting
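By contrast, time.sleep(3) inside a coroutine blocks the whole event loop, so no other task can make progress during those 3 seconds. A minimal sketch of the difference:

import asyncio
import time

async def bad():
    time.sleep(3)           # blocks the event loop; every task stalls

async def good():
    await asyncio.sleep(3)  # suspends only this coroutine; other tasks keep running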
Applying this to a crawler
async def download(url):
    print("loading")
    # the actual network request goes here; it must be asynchronous too (see the aiohttp section below)
    print("over")

async def main():
    urls = [
        "XXXX",
        "XXXX",
        "XXXXX",
    ]
    tasks = []
    for url in urls:
        d = asyncio.create_task(download(url))  # wrap the coroutine in a Task; newer versions of asyncio.wait require this
        tasks.append(d)
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
Asynchronous network requests
Installation:
pip install aiohttp
Writing the requests in Python
import asyncio
import aiohttp

urls = [
    "XXXXX",
    "XXXXX",
    "XXXXX",
]

async def aiodownload(url):
    async with aiohttp.ClientSession() as session:
        # session.get()  <==> requests.get()
        # session.post() <==> requests.post()
        async with session.get(url) as resp:
            # html = await resp.text()  # reading the body is asynchronous too, so it needs await
            name = url.rsplit("/", 1)[-1]  # derive a file name from the URL (assumption: the last path segment is usable)
            with open(name, mode="wb") as f:
                f.write(await resp.content.read())  # the body is read asynchronously, so it must be awaited

async def main():
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(aiodownload(url)))  # wrap in Tasks for asyncio.wait
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
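Real sites will throttle or ban a crawler that opens too many connections at once. A common pattern is to cap concurrency with asyncio.Semaphore; a minimal sketch, assuming a limit of 10 simultaneous requests:

async def main():
    sem = asyncio.Semaphore(10)  # at most 10 downloads in flight at a time

    async def limited(url):
        async with sem:          # acquire a slot before making the request
            await aiodownload(url)

    await asyncio.gather(*(limited(u) for u in urls))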