The following sample code shows how pagination, multithreading, and async I/O can be used in a web crawler:
import requests
import threading
import asyncio
import aiohttp
from bs4 import BeautifulSoup
# Fetch and parse a single page
def crawl_page(url):
    response = requests.get(url)
    # Parse the page content
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the information you need
    # ...
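As a concrete illustration of the extraction step, here is a small helper that pulls the page title and link targets out of the parsed soup. It is only a sketch: what you actually extract depends entirely on the target site, and the helper name is hypothetical.

def extract_links(soup):
    # Hypothetical example: grab the page title and every link target
    title = soup.title.string if soup.title else None
    links = [a.get('href') for a in soup.find_all('a', href=True)]
    return title, links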
# Crawl pages concurrently, one thread per URL
def multi_thread_crawl():
    urls = ['url1', 'url2', 'url3']  # list of page URLs to crawl
    threads = []
    for url in urls:
        t = threading.Thread(target=crawl_page, args=(url,))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
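Spawning one thread per URL stops scaling once the URL list grows; a bounded thread pool is the usual alternative. Here is a minimal sketch using the standard library's concurrent.futures (the default pool size of 5 is an arbitrary assumption):

from concurrent.futures import ThreadPoolExecutor

def pool_crawl(urls, max_workers=5):
    # Reuses crawl_page from above; the pool caps in-flight requests at max_workers
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        executor.map(crawl_page, urls)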
# Crawl a single page asynchronously
async def async_crawl_page(url):
    # The session must be closed after use; async with handles that automatically
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()
    # Parse the page content
    soup = BeautifulSoup(html, 'html.parser')
    # Extract the information you need
    # ...
# Schedule all page crawls concurrently on the event loop
async def async_crawl():
    urls = ['url1', 'url2', 'url3']  # list of page URLs to crawl
    tasks = []
    for url in urls:
        task = asyncio.create_task(async_crawl_page(url))
        tasks.append(task)
    await asyncio.gather(*tasks)
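Note that async_crawl must be driven by an event loop, e.g. asyncio.run(async_crawl()). To also cap how many requests run at once, a common pattern is asyncio.Semaphore; the sketch below assumes a limit of 5 concurrent requests is acceptable:

async def bounded_crawl(urls, limit=5):
    # The semaphore caps the number of in-flight requests at `limit`
    sem = asyncio.Semaphore(limit)
    async def guarded(url):
        async with sem:
            await async_crawl_page(url)
    await asyncio.gather(*(guarded(url) for url in urls))

# Entry point:
# asyncio.run(bounded_crawl(['url1', 'url2', 'url3']))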
# Crawl across pages by building the URL for each page number
def crawl_multiple_pages():
    base_url = 'http://example.com/page='
    for i in range(1, 6):  # crawl pages 1 to 5
        url = base_url + str(i)
        crawl_page(url)
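In practice the total page count is rarely known in advance, so pagination usually stops when a page stops yielding results. The variant below is a sketch that assumes a non-200 status marks the end of pagination; real sites signal this in different ways:

def crawl_until_empty(base_url, max_pages=100):
    for i in range(1, max_pages + 1):
        response = requests.get(base_url + str(i))
        if response.status_code != 200:
            break  # assumption: a non-200 response means there are no more pages
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract the information you need from soup
        # ...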
The above is a simple example showing how multithreading and async requests can be combined with pagination in a web crawler.