豆瓣图书 Top250 爬取（lxml + XPath，结果保存为 CSV）

最新推荐文章于 2024-04-21 14:16:50 发布

旧火车

最新推荐文章于 2024-04-21 14:16:50 发布

阅读量262

点赞数

分类专栏： python 文章标签： xpath python

本文链接：https://blog.csdn.net/weixin_43729137/article/details/108756618

版权

python 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

# 导入相应的库文件
from lxml import etree
import urllib.request
import csv

# URL template for the listing pages; "{0}" is filled in with str.format.
base_url = "https://book.douban.com/top250?start={0}"

# Create the output CSV and write its header row.
# newline='' leaves line-ending handling to the csv module.
fp = open('doubanbooktop250.csv', mode='wt', newline='', encoding='utf-8')
writer = csv.writer(fp)
writer.writerow([
    'name',
    'url',
    'author',
    'publisher',
    'date',
    'price',
    'rate',
    'comment',
])
# Build the listing-page URLs
def get_max_page():
    """Return the list of listing-page URLs to crawl.

    The offsets 0, 25, ..., 225 (ten values) are each substituted into the
    module-level ``base_url`` template, in ascending order.
    """
    return [base_url.format(str(offset)) for offset in range(0, 250, 25)]
# Every listing-page URL to fetch.
urls = get_max_page()

# Request headers carrying a desktop-browser User-Agent string.
user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
)
headers = {'User-Agent': user_agent}

# Fetch every listing page and write one CSV row per book.
# The try/finally guarantees the CSV file is flushed and closed even when a
# request or a parse step raises part-way through the crawl.
try:
    for url in urls:
        request = urllib.request.Request(url, headers=headers)
        # Use the response as a context manager so the HTTP connection is
        # released as soon as the page body has been read.
        with urllib.request.urlopen(request) as response:
            htmls = response.read().decode("utf-8")
        content = etree.HTML(htmls)
        # One <tr class="item"> element is selected per book.
        infos = content.xpath('//tr[@class="item"]')
        for info in infos:
            name = info.xpath('./td/div/a/@title')[0]
            # Named book_url so it does not shadow the page URL of the outer loop.
            book_url = info.xpath('./td/div/a/@href')[0]
            book_infos = info.xpath('./td/p/text()')[0]
            # The info line is "/"-separated; the first field is taken as the
            # author and the last three as publisher, date and price.
            # Split once and trim the whitespace left around each "/".
            fields = [field.strip() for field in book_infos.split('/')]
            author = fields[0]
            # Left-pad with empty strings so a line with fewer than three
            # fields yields blanks instead of raising IndexError.
            padded = [''] * (3 - len(fields)) + fields
            publisher, date, price = padded[-3:]
            rate = info.xpath('./td/div/span[2]/text()')[0]
            comments = info.xpath('./td/p/span/text()')
            # Fall back to the "空" placeholder when the book has no quote.
            comment = comments[0] if comments else "空"
            # Write the result row.
            writer.writerow((name, book_url, author, publisher, date, price, rate, comment))
finally:
    fp.close()