Learning Python: Scraping Book Information from Douban Books

1: Scraping basic information about the works listed on a Douban Books tag page

import fake_useragent
import requests
from lxml import etree

url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"

# Scrape book info from Douban Books and write it to a text file
fp = open("./doubantushu.txt", "w+", encoding="utf-8")

head = {
    "User-Agent": fake_useragent.UserAgent().random

}

response = requests.get(url, headers=head)

response_text = response.text

# etree.HTML(text) parses an HTML string fragment into an element tree
tree = etree.HTML(response_text)

# Get all the li elements under the subject list
li_list = tree.xpath("//ul[@class='subject-list']/li")

# Loop over each li element and extract the fields we want
for li in li_list:
    
    # The book title is stored in the title attribute of the a tag
    book_list = li.xpath(".//div[2]/h2/a/@title")[0]
    # Publication info line (author / publisher / year / price)
    writer_list = li.xpath(".//div[2]/div[1]/text()")[0]
    # Note: replace() and strip() both return a new string; Python strings are
    # immutable, so the original string is never modified in place.
    # strip() can be used inside the loop; replace() is applied when writing the
    # line out below (a cleaner csv-based version is sketched after this script).

    evaluation_list = li.xpath(".//div[2]/div[2]/span/text()")[0]   # rating
    sketch_list = li.xpath(".//div[2]/p/text()")[0]                 # short description
    price_list = li.xpath(".//div[2]/div[3]/div[2]/span/a/text()")[0]

    print(book_list, writer_list, evaluation_list, sketch_list, price_list)
    # Strip newlines and spaces from every field before writing the comma-separated line
    fp.write(book_list.replace("\n", "").replace(" ", "") + "," + writer_list.replace("\n", "").replace(" ", "") + "," + evaluation_list.replace("\n", "").replace(" ", "")
             + "," + sketch_list.replace("\n", "").replace(" ", "") + "," + price_list.replace("\n", "").replace(" ", "") + "\n")

fp.close()
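The repeated replace("\n", "").replace(" ", "") chains can be factored into one small helper, and Python's csv module handles separators and quoting more reliably than manual string concatenation. The sketch below is only one possible cleanup and reuses the li_list from the script above; the clean() helper and the output filename are illustrative, not part of the original script.

```python
import csv
import re

def clean(text):
    # Collapse all whitespace (newlines, tabs, repeated spaces) to nothing,
    # mirroring the replace("\n", "").replace(" ", "") chains above
    return re.sub(r"\s+", "", text)

with open("./doubantushu.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    for li in li_list:
        row = [
            li.xpath(".//div[2]/h2/a/@title")[0],           # title
            li.xpath(".//div[2]/div[1]/text()")[0],         # publication info
            li.xpath(".//div[2]/div[2]/span/text()")[0],    # rating
            li.xpath(".//div[2]/p/text()")[0],              # short description
        ]
        writer.writerow([clean(field) for field in row])
```

csv.writer also takes care of fields that themselves contain commas, which the manual + "," + concatenation above does not.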

2: Scraping more detailed information for each work (the details on the work's own page)

import fake_useragent
import requests
from lxml import etree

url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"

# Scrape the detail-page info for each book on the Douban Books tag page
fp = open("./doubantushu2.txt", "w+", encoding="utf-8")

head = {
    "User-Agent": fake_useragent.UserAgent().random

}

response = requests.get(url, headers=head)

response_text = response.text

# etree.HTML(text) parses an HTML string fragment into an element tree
tree = etree.HTML(response_text)

# Collect the detail-page URL of every book so we can fetch a fuller description
url_list = tree.xpath("//ul[@class='subject-list']/li/div[2]//a/@href")

# Loop over each detail-page URL and extract the fields we want
for detail_url in url_list:
    resp = requests.get(detail_url, headers=head)
    resp_text = resp.text
    tree1 = etree.HTML(resp_text)
    book_list = tree1.xpath("//div[@id='wrapper']/h1/span/text()")[0]                            # title
    director_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/span[1]/a/text()")[0]   # author
    publish_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/a[1]/text()")[0]         # publisher
    # Not every book lists a producer; fall back to "0" so the script does not crash
    producer = tree1.xpath("//div[@class='subject clearfix']/div[2]/a[2]/text()")
    producer_list = producer[0] if producer else "0"
    # Pick the text node that follows the label span: /following-sibling::text()[1]
    year_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/span[@class='pl' "
                            "and text()='出版年:']/following-sibling::text()[1]")[0].strip()
    page_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/span[@class='pl' "
                            "and text()='页数:']/following-sibling::text()[1]")[0].strip()
    price_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/span[@class='pl' "
                             "and text()='定价:']/following-sibling::text()[1]")[0].strip()
    pool_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/span[@class='pl' "
                            "and text()='装帧:']/following-sibling::text()[1]")[0].strip()   # binding
    ISBN_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/span[@class='pl' "
                            "and text()='ISBN:']/following-sibling::text()[1]")[0].strip()
    series_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/a[3]/text()")[0]      # series

    print(book_list, director_list, publish_list, year_list, page_list, price_list, pool_list, series_list, ISBN_list)

    fp.write(
        book_list + "," + director_list + "," + publish_list + "," + year_list + "," + page_list + ","
        + price_list + "," + pool_list + "," + series_list + "," + ISBN_list + "\n")

    # break
fp.close()
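Every labeled field on the detail page follows the same pattern: a span with class "pl" holding the label, followed by a text node with the value. That pattern can be wrapped in one helper that returns a default when the field is missing, which also avoids the IndexError you would otherwise get on a book without, say, a 页数 entry. This is a sketch built on the same XPaths as above; the get_field() name and the default value are my own additions, not from the original code.

```python
def get_field(tree, label, default=""):
    # Text node immediately after the <span class="pl">label</span> element
    nodes = tree.xpath(
        "//div[@class='subject clearfix']/div[2]/span[@class='pl' "
        "and text()='{}']/following-sibling::text()[1]".format(label)
    )
    return nodes[0].strip() if nodes else default

# Usage inside the loop above:
# year_list  = get_field(tree1, "出版年:")
# page_list  = get_field(tree1, "页数:")
# price_list = get_field(tree1, "定价:")
# ISBN_list  = get_field(tree1, "ISBN:")
```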
3: A rough crawler skeleton for larger-scale scraping

When scraping data at a larger scale you need to watch out for anti-crawling measures, otherwise your IP may get blocked. Below is a rough crawler framework that you can adapt and optimize further.

```python
import requests
from bs4 import BeautifulSoup
import time

# Request headers that mimic a normal browser visit
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}

# Scrape the book entries from one listing page
def parse_one_page(url):
    book_info_list = []
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    book_list = soup.find_all('li', attrs={'class': 'subject-item'})
    for book in book_list:
        # Parse the title, rating and comment count
        book_name = book.find('div', attrs={'class': 'info'}).find('a').get_text().strip()
        rating_num = book.find('div', attrs={'class': 'star clearfix'}).find(
            'span', attrs={'class': 'rating_nums'}).get_text().strip()
        comment_num = book.find('div', attrs={'class': 'star clearfix'}).find(
            'span', attrs={'class': 'pl'}).get_text().strip().replace('(', '').replace(')', '')
        book_info_list.append((book_name, rating_num, comment_num))
    return book_info_list

# Main function that controls how many pages get scraped
def main():
    book_info_total = []
    for i in range(0, 20):
        url = 'https://book.douban.com/top250?start={}'.format(i * 25)
        book_info = parse_one_page(url)
        book_info_total.extend(book_info)
        time.sleep(1)  # Wait one second between pages to avoid getting the IP banned

    # Save the results to a file
    with open('book_info.txt', 'w') as f:
        for book in book_info_total:
            f.write('{}\t{}\t{}\n'.format(book[0], book[1], book[2]))

if __name__ == '__main__':
    main()
```

This code scrapes the title, rating and comment count, 25 books per page, for 20 pages; adjust the range in main() to scrape more or fewer pages. Keep in mind that when scraping a lot of data, requests may fail or the program may hang because of network issues, so proper exception handling and a retry mechanism should be added.
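For the exception handling and retry mechanism mentioned above, requests can be combined with urllib3's Retry through an HTTPAdapter. The sketch below shows one way to do it; the retry count, backoff factor and timeout are illustrative values, not something prescribed by the original post.

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Session that retries failed requests with exponential backoff
session = requests.Session()
retry = Retry(total=3, backoff_factor=1,
              status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retry))

def fetch(url, headers):
    # Return the page HTML, or None if the request still fails after retries
    try:
        resp = session.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException as exc:
        print("request failed:", url, exc)
        return None
```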