python爬虫爬取图书信息

代码示例展示了如何利用Python的requests和lxml库,结合XPath解析HTML,爬取并提取豆瓣Top250图书的名称、作者、出版社、评分等信息,并存储到文本文件中。程序还涉及到了请求头伪装和数据清洗。
摘要由CSDN通过智能技术生成

初学python爬虫的xpath,今天用它完成对豆瓣top250图书的爬取

话不多说,直接上代码,后面有解释。

import fake_useragent
import requests
from lxml import etree

if __name__ == '__main__':
    # Crawl the ten list pages of book.douban.com/top250 and save every book
    # title that is found, one per line, to ./book-250.txt.
    url = 'https://book.douban.com/top250?start={}'
    # The context manager guarantees the output file is flushed and closed,
    # even if a request or an XPath lookup raises part-way through the crawl.
    with open('./book-250.txt', 'w', encoding='utf8') as fp:
        # `start` is stepped by 25 from 0 to 225, i.e. ten page requests.
        for i in range(0, 250, 25):
            book_url = url.format(i)
            res = requests.get(
                url=book_url,
                # A freshly generated random User-Agent is sent per request.
                headers={'User-Agent': fake_useragent.UserAgent().random},
                # Without a timeout a stalled connection would block forever.
                timeout=10,
            )
            tree = etree.HTML(res.text)
            # Each matched <table> is treated as one book entry.
            li_names = tree.xpath('//*[@id="content"]/div/div[1]/div/table')
            for names in li_names:
                titles = names.xpath('.//div[@class="pl2"]/a/text()')
                if not titles:
                    # No title text under this table: skip it instead of
                    # crashing with IndexError on titles[0].
                    continue
                # strip() removes the whitespace surrounding the title text.
                book_name = titles[0].strip()
                print(book_name)
                fp.write(book_name + '\n')

1、url的拼接,

注意这里的start参数:每页有25条数据,所以用for循环让start以25为步长从0递增到225,依次请求全部十页。

2、获取book_name的数据后,用strip()去除每条数据首尾的空白字符(空格、换行等)。

3、写入到txt文件时,在每条数据后加上'\n',使数据一行一行地写出。

结果为:

对上述代码进行补充,爬取图书的所有内容:

并且成功处理了各种缺失数据的情况:

import os
import re
import time

import fake_useragent
import requests
from lxml import etree

# 爬取豆瓣图书250的details

def pick_book_fields(book_info):
    """Pick author, publisher, date and price out of a split info line.

    *book_info* is the list obtained by splitting a book's publication line
    on ``' / '``.  The last three items are taken as publisher, publication
    time and price; everything before them is the author/translator list,
    of which only the first entry is kept.

    Returns a ``(author, publisher, publish_time, price)`` tuple, or ``None``
    when there are fewer than four items, i.e. no author entry is left in
    front of the three trailing fields.
    """
    if len(book_info) < 4:
        return None
    publisher, publish_time, price = (part.strip() for part in book_info[-3:])
    return book_info[0], publisher, publish_time, price


def strip_enclosing(text):
    """Drop the first and last character of *text*, then trim whitespace.

    Used on the rating-count text to remove its surrounding pair of
    characters (presumably parentheses - the code does not check them).
    """
    return text[1:-1].strip()


def main():
    """Crawl the ten Top 250 list pages and write book details to a file.

    For every book table found, the name, first author, publisher,
    publication time, price, score, rating count and quote are printed and
    then written as one line to ./book_details.txt.  Books without an author
    entry or without a quote are skipped.
    """
    url = 'https://book.douban.com/top250?start={}'
    head = {
        'User-Agent': fake_useragent.UserAgent().random
    }
    # The context manager closes the file even if the crawl fails mid-way.
    with open('./book_details.txt', 'w', encoding='utf8') as f:
        # `start` is stepped by 25 from 0 to 225, i.e. ten page requests.
        for i in range(0, 250, 25):
            book_url = url.format(i)
            # Without a timeout a stalled connection would block forever.
            res = requests.get(url=book_url, headers=head, timeout=10)
            tree = etree.HTML(res.text)
            table_details = tree.xpath('//div[@id="content"]/div/div[1]/div/table')
            for table in table_details:
                book_name = table.xpath('.//div[@class="pl2"]/a/text()')[0].strip()
                print(book_name)
                # Only the cover URL is reported; the image itself is not
                # downloaded because nothing in this script stores it.
                book_pic_url = table.xpath('.//a[@class="nbg"]/img/@src')[0]
                print(book_pic_url)

                book_info = table.xpath('.//p[@class="pl"]/text()')[0].split(' / ')
                print(book_info)
                fields = pick_book_fields(book_info)
                if fields is None:
                    # No author entry available: skip this book.
                    continue
                book_author, book_publish, book_time, book_price = fields
                print(book_author)
                print(book_publish)
                print(book_time)
                print(book_price)

                book_score = table.xpath('.//div[@class="star clearfix"]/span[2]/text()')[0].strip()
                book_evaluates = table.xpath('.//div[@class="star clearfix"]/span[3]/text()')[0].strip()
                print(book_score)
                book_evaluate = strip_enclosing(book_evaluates)
                print(book_evaluate)

                quotes = table.xpath('.//p[@class="quote"]/span/text()')
                if not quotes:
                    # No quote for this book: skip it.
                    continue
                book_intro = quotes[0]
                print(book_intro)
                # Write the whole quote; indexing it again with [0] would
                # keep only its first character.
                # NOTE(review): fields are concatenated without a delimiter,
                # which makes the line hard to parse back - consider adding
                # a separator.
                f.write(''.join((
                    book_name, book_author, book_publish, book_time,
                    book_price, book_score, book_evaluate, book_intro,
                )) + '\n')


if __name__ == '__main__':
    main()

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值