Scraping the Douban Books Top 250 (title, author, rating) with multiple coroutines and a queue, saving the results to CSV

A note to record my own web-scraping learning journey! The plan: enqueue each page's request parameters into a gevent queue, then spawn three coroutines that each pull parameters from the queue, fetch and parse the page, and write one CSV row per book.
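Before the full script, here is a minimal standalone sketch of the coroutine-plus-queue pattern it relies on (no network access needed; the worker names and the 0.1-second sleep are placeholders that simulate I/O):

import gevent
from gevent.queue import Queue

demo_queue = Queue()
for n in range(6):
    demo_queue.put_nowait(n)        # enqueue six dummy tasks

def worker(name):
    # each coroutine drains tasks until the shared queue is empty
    while not demo_queue.empty():
        task = demo_queue.get_nowait()
        gevent.sleep(0.1)           # simulated I/O; yields control to the other coroutines
        print(name, 'finished task', task)

gevent.joinall([gevent.spawn(worker, 'worker-%d' % i) for i in range(3)])

While one coroutine waits on I/O (here the sleep, in the real script the HTTP request), gevent switches to another one, which is where the speedup over a plain sequential loop comes from.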

from gevent import monkey
monkey.patch_all()   # patch the standard library first so requests becomes cooperative (non-blocking)
import time
import gevent
from gevent.queue import Queue
import requests
from bs4 import BeautifulSoup
import csv
# Scrape the Douban Books Top 250 (title, author, rating) with coroutines and a queue, saving to CSV

csv_file = open('books.csv', 'w', newline='', encoding='utf-8')
writer = csv.writer(csv_file)
writer.writerow(['title', 'author', 'rating'])  # optional header row, added for readability

url = 'https://book.douban.com/top250?'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'}
pagesize = 25    # Douban lists 25 books per page
startpage = 0
start = time.time()
work = Queue()   # task queue shared by all coroutines

# enqueue the query parameters for the first 3 pages (start = 0, 25, 50)
for x in range(3):
    params = {'start': startpage + x * pagesize}
    work.put_nowait(params)

def oo():
    # each coroutine keeps pulling page parameters until the queue is empty
    while not work.empty():
        param = work.get_nowait()

        res = requests.get(url, params=param, headers=headers)
        #print(res.status_code)
        bs = BeautifulSoup(res.text, 'html.parser')
        fd = bs.find_all('tr', class_="item")  # one <tr class="item"> per book

        # parse every book on the current page; this loop must sit inside
        # the while loop so that each fetched page gets processed
        for i in fd:
            title = i.find('div', class_="pl2")
            title_tag = title.find('a')
            title_name = title_tag.text.replace(' ', '').replace('\n', '')
            zuozhe = i.find('p', class_="pl")  # author / publisher line
            zuozhe_name = zuozhe.text
            pingfen = i.find('span', class_="rating_nums")
            fen = pingfen.text + '分'  # rating, e.g. "9.4分"
            print(title_name, zuozhe_name, fen)
            writer.writerow([title_name, zuozhe_name, fen])

tasks_list = []
for y in range(3):
    task = gevent.spawn(oo)   # start three coroutines all running oo concurrently
    tasks_list.append(task)
gevent.joinall(tasks_list)    # block until every coroutine has finished
end = time.time()
print(end - start)            # total elapsed time in seconds
csv_file.close()
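To sanity-check the output, a short read-back sketch like the following can be used (assuming the script above has already run and produced books.csv in the working directory):

import csv

# print the first few rows of books.csv to confirm the format
with open('books.csv', newline='', encoding='utf-8') as f:
    for row_number, row in enumerate(csv.reader(f)):
        print(row)           # each row: [title, author, rating]
        if row_number >= 4:  # show only the first five rows
            break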