# 记录一下 自己的爬虫 学习历史! (A note recording my web-scraping learning history.)
from gevent import monkey
monkey.patch_all()
import time
import gevent
from gevent.queue import Queue
import requests
from bs4 import BeautifulSoup
import csv
# Crawl Douban Books Top 250 (title, author, rating) with multiple
# coroutines and a shared queue, storing the results in a CSV file.
csv_file = open('books.csv', 'w', newline='', encoding='utf-8')
writer = csv.writer(csv_file)

url = 'https://book.douban.com/top250?'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'}

pagesize = 25   # books per listing page
startpage = 0   # offset of the first page

start = time.time()  # wall-clock start; elapsed time is printed at script end

# Fill the shared work queue with one params dict (page offset) per page.
work = Queue()
for x in range(3):
    params = {'start': startpage + x * pagesize}
    work.put_nowait(params)
def oo():
    """Worker coroutine: drain the shared queue, scrape each page, write rows.

    Each queue item is a ``params`` dict holding the listing-page offset.
    For every book entry on a page, the title, author line, and rating are
    extracted and both printed and appended via the shared CSV writer.
    """
    while not work.empty():
        param = work.get_nowait()
        res = requests.get(url, params=param, headers=headers)
        bs = BeautifulSoup(res.text, 'html.parser')
        # Each book entry is a <tr class="item"> row in the listing table.
        for entry in bs.find_all('tr', class_="item"):
            # Title is inside <div class="pl2"><a ...>; strip the spaces and
            # newlines Douban inserts for layout.
            title = entry.find('div', class_="pl2")
            title_tag = title.find('a')
            title_name = title_tag.text.replace(' ', '').replace('\n', '')
            # Author / publisher line.
            zuozhe = entry.find('p', class_="pl")
            zuozhe_name = zuozhe.text
            # Numeric rating, rendered e.g. as "9.6分".
            pingfen = entry.find('span', class_="rating_nums")
            fen = pingfen.text + '分'
            print(title_name, zuozhe_name, fen)
            writer.writerow([title_name, zuozhe_name, fen])
# Spawn three worker coroutines sharing the same queue, then block until
# they have all finished.
tasks_list = [gevent.spawn(oo) for y in range(3)]
gevent.joinall(tasks_list)

end = time.time()
print(end - start)  # elapsed seconds for the whole crawl

csv_file.close()