豆瓣之互联网书籍名称爬取(新手)

from urllib import request
import  urllib.parse
import  urllib.error
from bs4 import BeautifulSoup
import csv
import io
import sys
import time

# Replace sys.stdout with a text wrapper over the same underlying byte buffer
# that encodes output as GB18030, so every later print() emits GB18030 bytes.
# NOTE(review): presumably targets a Chinese-locale Windows console - confirm.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
# resp = request.urlopen('https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?type=S')
# html_data = resp.read().decode('utf-8')
# # print(html_data)
# soup = BeautifulSoup(html_data,'html.parser')


def get_url_book(url):
    """Download one listing page and return the books found on it.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        The list of per-book dicts that ``read`` extracts from the page.

    Raises:
        urllib.error.URLError: If ``urlopen`` cannot complete the request.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    print(url)
    # The context manager closes the HTTP response even when reading or
    # decoding fails; previously the response object was never closed.
    with urllib.request.urlopen(url) as resp:
        web_data = resp.read().decode('utf-8')
    soup = BeautifulSoup(web_data, 'html.parser')
    # Pause after each download so consecutive requests are spaced out.
    time.sleep(2)
    return read(soup)

def _first_string(node, name, class_=None, default=''):
    """Return ``.string`` of the first matching descendant, or *default*.

    *default* is returned both when nothing matches and when the first match
    has ``.string`` set to ``None`` (no single text child to read).
    """
    if class_ is None:
        matches = node.find_all(name)
    else:
        matches = node.find_all(name, class_=class_)
    if not matches:
        return default
    text = matches[0].string
    return default if text is None else text


def read(soup):
    """Extract the book entries from a parsed listing page.

    Args:
        soup: Parsed page; must support ``find_all(name, class_=...)``.

    Returns:
        A list with one dict per ``div.info`` block found inside the
        ``li.subject-item`` elements of the first ``ul.subject-list``.
        Each dict has the keys ``name``, ``dec``, ``pub``, ``rating_nums``
        and ``people``. Missing optional fields fall back to ``''`` (or to
        the "fewer than 10 ratings" placeholder for ``rating_nums``).
        An empty list is returned when the page has no ``ul.subject-list``.

    Raises:
        IndexError: If an info block contains no ``<a>`` element.
        KeyError: If that first ``<a>`` has no ``title`` attribute.
    """
    subject_lists = soup.find_all('ul', class_='subject-list')
    if not subject_lists:
        # Nothing to parse (e.g. an empty or unexpected page) - report no
        # books instead of failing with IndexError.
        return []

    book_list = []
    for item in subject_lists[0].find_all('li', class_='subject-item'):
        for info in item.find_all('div', class_='info'):
            # A fresh dict per info block, so entries never alias each other.
            book_list.append({
                'name': info.find_all('a')[0]['title'],
                'dec': _first_string(info, 'p'),
                'pub': _first_string(info, 'div', 'pub').replace('\n', ''),
                'rating_nums': _first_string(
                    info, 'span', 'rating_nums', default='(少于10人评价)'),
                'people': _first_string(info, 'span', 'pl').replace('\n', ''),
            })
    return book_list

# Accumulate the rows from every listing page. Named ``book_list`` rather than
# ``list`` so the builtin is not shadowed for the rest of the module.
book_list = []
for page in range(3):
    # ``start`` advances by 20 per page (presumably the site's page size).
    url = 'https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start={}&type=T'.format(page * 20)
    book_list.extend(get_url_book(url))
    print(url)
print(book_list)

# newline='' lets the csv module control line endings itself; the file is
# written as GB18030 to match the encoding used for console output.
with open("book.csv", "w", encoding='gb18030', newline='') as datacsv:
    csvwriter = csv.writer(datacsv, dialect="excel")
    csvwriter.writerow(["名称", "描述", "作者/出版社/价格", "评分", "评分人数"])
    csvwriter.writerows(
        [item['name'], item['dec'], item['pub'], item['rating_nums'], item['people']]
        for item in book_list
    )

print("ok")

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值