Python Web Scraping Notes (4)

Thread Pools

A thread pool creates a batch of threads up front. The caller simply submits tasks to the pool, and the pool takes care of scheduling them onto its worker threads.
```python
from concurrent.futures import ThreadPoolExecutor

def fn(name):
    for i in range(100):
        print(name, i)

if __name__ == '__main__':
    # Create a thread pool with 50 worker threads
    with ThreadPoolExecutor(50) as t:
        for i in range(100):
            t.submit(fn, name=f'thread-{i}')
    # The with block waits for every task in the pool to finish before execution continues
    print("123")
```
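`submit()` returns a `concurrent.futures.Future` for every task, so when tasks produce values the results can be collected as they finish. Below is a minimal sketch of that pattern; the `square` worker is a made-up example, not from the code above.

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def square(n):
    # A stand-in task that returns a value
    return n * n

if __name__ == '__main__':
    with ThreadPoolExecutor(8) as pool:
        futures = [pool.submit(square, i) for i in range(10)]
        # as_completed yields each Future as soon as its task is done
        for fut in as_completed(futures):
            print(fut.result())
```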
Scraping vegetable price data from the Beijing Xinfadi site and writing it to a CSV file

Approach:

- First, get data extraction working for a single page (see the probe sketch after this list)
- Then add a thread pool so multiple pages are scraped concurrently
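Before writing the full script, it can help to probe the endpoint once and inspect the shape of the JSON it returns. This is only a minimal sketch of that first step, assuming the same `getPriceData.html` endpoint, `limit`/`current` form fields, and response keys that the full script below relies on.

```python
import requests

# One-off probe of the price API to inspect the JSON structure
url = 'http://www.xinfadi.com.cn/getPriceData.html'
headers = {'user-agent': 'Mozilla/5.0'}

resp = requests.post(url, data={'limit': '20', 'current': 1}, headers=headers)
data = resp.json()
print(data.keys())      # expect a 'list' key holding the price records
print(data['list'][0])  # one record: prodName, lowPrice, avgPrice, highPrice, place, ...
```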
```python
# Scrape vegetable price data from the Beijing Xinfadi site and write it to a CSV file
import csv
import time
import requests
from concurrent.futures import ThreadPoolExecutor

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
}

f = open('data_file/' + 'veg_price_data_multi_thread.csv', mode='w', encoding='utf-8', newline='')
csvwriter = csv.writer(f)

def download_one_page(url, page):
    data = {
        'limit': '20',
        'current': page,
    }
    # Fetch one page of records from the price API
    resp = requests.post(url, data=data, headers=headers)
    # print(resp.status_code)
    resp.encoding = 'utf-8'
    # print(resp.json())
    vegPriceList = resp.json()['list']
    for veg in vegPriceList:
        veg_name = veg['prodName']
        veg_lowPrice = veg['lowPrice']
        veg_avgPrice = veg['avgPrice']
        veg_highPrice = veg['highPrice']
        veg_place = veg['place']
        veg_list = [veg_name, veg_lowPrice, veg_avgPrice, veg_highPrice, veg_place]
        # print(veg_name, veg_lowPrice, veg_avgPrice, veg_highPrice, veg_place)
        csvwriter.writerow(veg_list)
    print(f'Page {page} extracted successfully!')

if __name__ == '__main__':
    url = 'http://www.xinfadi.com.cn/getPriceData.html'
    # download_one_page(url, 10)

    # Single-threaded download
    # for i in range(1, 100):
    #     download_one_page(url, i)
    #     time.sleep(3)

    # Download with a thread pool
    with ThreadPoolExecutor(50) as t:
        for i in range(1, 100):
            # Submit the download task to the thread pool
            t.submit(download_one_page, url=url, page=i)
            # Note: without this sleep the run stops after only a few pages (the site
            # likely added anti-scraping measures), but adding it largely defeats the
            # benefit of the thread pool
            time.sleep(1)
    f.close()
    print('All downloads finished!')
```
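One caveat with the script above: all worker threads write to the same `csv.writer`, so rows from different pages can interleave mid-write. A simple way to make this safe is to guard the shared writer with a lock and write one page's rows at a time. The sketch below shows that idea; inside `download_one_page` you would append each `veg_list` to a page-local list and call `write_rows` once per page, an adjustment not shown here.

```python
import csv
import threading

f = open('data_file/veg_price_data_multi_thread.csv', mode='w', encoding='utf-8', newline='')
csvwriter = csv.writer(f)
write_lock = threading.Lock()

def write_rows(rows):
    # Serialize access to the shared csv.writer so rows from different
    # threads cannot interleave
    with write_lock:
        for row in rows:
            csvwriter.writerow(row)
```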