爬取豆瓣"小说"标签下的图书信息(start=0~1000,共 51 页)

"""Scrape book listings from Douban's 小说 (fiction) tag into a CSV file.

For every listing page (start=0,20,...,1000 — 51 pages of 20 books), extract
each book's title, author/publisher line, rating, rating count and blurb,
and append one CSV row per book.
"""
import requests, time, re, csv
from bs4 import BeautifulSoup
import codecs

CSV_PATH = r'C:\Users\Administrator\Desktop\小说.csv'

# Prepend a UTF-8 BOM so Excel on Windows detects the encoding correctly.
with open(CSV_PATH, 'ab+') as fp:
    fp.write(codecs.BOM_UTF8)

urls = ['https://book.douban.com/tag/小说?start={}&type=T/'.format(str(i)) for i in range(0, 1001, 20)]

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}

# Compiled once, outside the loop; raw string avoids the invalid-escape warning.
NUM_RE = re.compile(r'\d+')

# `with` guarantees the CSV file is closed even if a request raises midway.
with open(CSV_PATH, 'a+', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(('名称', '作者', '评分', '人数', '简介'))
    for url in urls:
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'lxml')
        for it in soup.select('.subject-item'):
            name = it.h2.a['title']
            author = it.select_one('.pub').text.strip()
            # Unrated books have no .rating_nums node; select_one returns None.
            score_tag = it.select_one('.rating_nums')
            score = score_tag.text if score_tag else ''
            pl_text = it.select_one('.pl').text.strip()
            match = NUM_RE.search(pl_text)
            # Bug fix: the original wrote the undefined name `num` here
            # (the extracted value was bound to `num2`), crashing with
            # NameError on the first row.
            num = match.group() if match else '0'
            content = it.p.text if it.p else ''
            writer.writerow((name, author, score, num, content))
            time.sleep(1)  # polite crawl delay between books

 

参考:https://blog.csdn.net/zxcjxx/article/details/105317054

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值