python爬虫爬取图书信息

Acel丶

已于 2023-04-07 14:22:48 修改

阅读量1.6k

点赞数 1

文章标签： python

于 2023-03-31 20:23:15 首次发布

本文链接：https://blog.csdn.net/weixin_51506849/article/details/129887892

版权

代码示例展示了如何利用Python的requests和lxml库，结合XPath解析HTML，爬取并提取豆瓣Top250图书的名称、作者、出版社、评分等信息，并存储到文本文件中。程序还涉及到了请求头伪装和数据清洗。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

初学python爬虫的xpath，今天用它完成对豆瓣top250图书的爬取

话不多说，直接上代码，后面有解释。

import fake_useragent
import requests
from lxml import etree

if __name__ == '__main__':
    # Crawl the ten pages of the Douban Top 250 book list and save one book
    # title per line to ./book-250.txt.
    url = 'https://book.douban.com/top250?start={}'
    # Build the UserAgent helper once; a fresh random User-Agent string is
    # still drawn from it for every page request.
    ua = fake_useragent.UserAgent()

    # The with-block closes the output file even if a request or parse fails.
    with open('./book-250.txt', 'w', encoding='utf8') as fp:
        # `start` is passed as the page offset: 0, 25, ..., 225 (ten pages).
        for i in range(0, 250, 25):
            book_url = url.format(i)
            # A timeout keeps a stalled connection from hanging the script.
            res = requests.get(
                url=book_url,
                headers={'User-Agent': ua.random},
                timeout=10,
            )
            tree = etree.HTML(res.text)
            # Each matched <table> is treated as one book entry.
            li_names = tree.xpath('//*[@id="content"]/div/div[1]/div/table')
            for names in li_names:
                titles = names.xpath('.//div[@class="pl2"]/a/text()')
                if not titles:
                    # No title text under this table: skip it instead of
                    # raising IndexError.
                    continue
                book_name = titles[0].strip()
                print(book_name)
                fp.write(book_name + '\n')

1、url的拼接，

注意这里的start参数表示每页第一条数据的偏移量，每页25条数据，所以用for循环以25为步长从0遍历到225，把十页都爬取到。

2、取得book_name的数据后，用strip()去除每条数据首尾的空白字符（空格、换行等）。

3、写入到txt文件时，+'\n'将数据一行行的写出。

结果为：

对上述代码进行补充，爬取图书的所有内容：

并且处理了各种数据缺失的情况：

import os
import re
import time

import fake_useragent
import requests
from lxml import etree

# 爬取豆瓣图书250的details

def _first_text(node, path):
    """Return the first XPath match under *node*, stripped, or '' if none."""
    matches = node.xpath(path)
    return matches[0].strip() if matches else ''


def _split_book_info(info_text):
    """Split a ' / '-separated info line into (author, publisher, date, price).

    The last three fields are taken as publisher, publication date and price;
    everything before them is the author list, of which only the first entry
    is returned. Missing fields come back as empty strings.
    """
    parts = [part.strip() for part in info_text.split(' / ')]
    if len(parts) < 3:
        # Left-pad short lines so the unpacking below cannot fail.
        parts = [''] * (3 - len(parts)) + parts
    *authors, publisher, pub_date, price = parts
    author = authors[0] if authors else ''
    return author, publisher, pub_date, price


if __name__ == '__main__':
    # Crawl the ten pages of the Douban Top 250 book list and write one
    # tab-separated record per book to ./book_details.txt:
    # name, author, publisher, date, price, score, rating count, quote.
    url = 'https://book.douban.com/top250?start={}'
    head = {
        'User-Agent': fake_useragent.UserAgent().random
    }

    # The with-block closes the output file even if a request or parse fails.
    with open('./book_details.txt', 'w', encoding='utf8') as f:
        for i in range(0, 250, 25):
            book_url = url.format(i)
            # A timeout keeps a stalled connection from hanging the script.
            res = requests.get(url=book_url, headers=head, timeout=10)
            tree = etree.HTML(res.text)
            # Each matched <table> is treated as one book entry.
            table_details = tree.xpath('//div[@id="content"]/div/div[1]/div/table')
            for table in table_details:
                book_name = _first_text(table, './/div[@class="pl2"]/a/text()')
                print(book_name)

                # Only the cover URL is printed; the image itself is not
                # downloaded, since nothing in this script saves it.
                book_pic_url = _first_text(table, './/a[@class="nbg"]/img/@src')
                print(book_pic_url)

                book_author, book_publish, book_time, book_price = _split_book_info(
                    _first_text(table, './/p[@class="pl"]/text()'))
                print(book_author)
                print(book_publish)
                print(book_time)
                print(book_price)

                book_score = _first_text(
                    table, './/div[@class="star clearfix"]/span[2]/text()')
                book_evaluates = _first_text(
                    table, './/div[@class="star clearfix"]/span[3]/text()')
                print(book_score)
                # Drop the first and last character (presumably the
                # surrounding parentheses) and trim the inner whitespace.
                book_evaluate = book_evaluates[1:-1].strip()
                print(book_evaluate)

                # A missing quote yields '' so the book is still recorded.
                book_intro = _first_text(table, './/p[@class="quote"]/span/text()')
                print(book_intro)

                # Tab-separate the fields so each line can be split back
                # into its columns; the whole quote is written, not just
                # its first character.
                record = [
                    book_name, book_author, book_publish, book_time,
                    book_price, book_score, book_evaluate, book_intro,
                ]
                f.write('\t'.join(record) + '\n')