Learning Python: Scraping Book Information from Douban Books

1: Scraping basic information about the works listed on a Douban Books tag page

import fake_useragent
import requests
from lxml import etree

url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"

# Scrape book info from Douban Books and write it to a text file
fp = open("./doubantushu.txt", "w+", encoding="utf-8")

head = {
    "User-Agent": fake_useragent.UserAgent().random

}

response = requests.get(url, headers=head)

response_text = response.text

# etree.HTML(text) parses an HTML string fragment into an element tree
tree = etree.HTML(response_text)

# Get all the li elements under the subject list
li_list = tree.xpath("//ul[@class='subject-list']/li")

# Loop over each li element and extract the fields we want
for li in li_list:
    
    # The book title is stored in the title attribute of the a tag
    book_list = li.xpath(".//div[2]/h2/a/@title")[0]
    # Publication info line (author / publisher / year / price)
    writer_list = li.xpath(".//div[2]/div[1]/text()")[0]
    # Note: replace() and strip() both return a new string; Python strings are
    # immutable, so the original string is never modified in place.
    # strip() can be used inside the loop; replace() is applied when writing the
    # line out below (a cleaner csv-based version is sketched after this script).

    evaluation_list = li.xpath(".//div[2]/div[2]/span/text()")[0]   # rating
    sketch_list = li.xpath(".//div[2]/p/text()")[0]                 # short description
    price_list = li.xpath(".//div[2]/div[3]/div[2]/span/a/text()")[0]

    print(book_list, writer_list, evaluation_list, sketch_list, price_list)
    # Strip newlines and spaces from every field before writing the comma-separated line
    fp.write(book_list.replace("\n", "").replace(" ", "") + "," + writer_list.replace("\n", "").replace(" ", "") + "," + evaluation_list.replace("\n", "").replace(" ", "")
             + "," + sketch_list.replace("\n", "").replace(" ", "") + "," + price_list.replace("\n", "").replace(" ", "") + "\n")

fp.close()
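The repeated replace("\n", "").replace(" ", "") chains can be factored into one small helper, and Python's csv module handles separators and quoting more reliably than manual string concatenation. The sketch below is only one possible cleanup and reuses the li_list from the script above; the clean() helper and the output filename are illustrative, not part of the original script.

```python
import csv
import re

def clean(text):
    # Collapse all whitespace (newlines, tabs, repeated spaces) to nothing,
    # mirroring the replace("\n", "").replace(" ", "") chains above
    return re.sub(r"\s+", "", text)

with open("./doubantushu.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    for li in li_list:
        row = [
            li.xpath(".//div[2]/h2/a/@title")[0],           # title
            li.xpath(".//div[2]/div[1]/text()")[0],         # publication info
            li.xpath(".//div[2]/div[2]/span/text()")[0],    # rating
            li.xpath(".//div[2]/p/text()")[0],              # short description
        ]
        writer.writerow([clean(field) for field in row])
```

csv.writer also takes care of fields that themselves contain commas, which the manual + "," + concatenation above does not.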

2: Scraping more detailed information for each work (the details on the work's own page)

import fake_useragent
import requests
from lxml import etree

url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"

# Scrape the detail-page info for each book on the Douban Books tag page
fp = open("./doubantushu2.txt", "w+", encoding="utf-8")

head = {
    "User-Agent": fake_useragent.UserAgent().random

}

response = requests.get(url, headers=head)

response_text = response.text

# etree.HTML(text) parses an HTML string fragment into an element tree
tree = etree.HTML(response_text)

# Collect the detail-page URL of every book so we can fetch a fuller description
url_list = tree.xpath("//ul[@class='subject-list']/li/div[2]//a/@href")

# Loop over each detail-page URL and extract the fields we want
for detail_url in url_list:
    resp = requests.get(detail_url, headers=head)
    resp_text = resp.text
    tree1 = etree.HTML(resp_text)
    book_list = tree1.xpath("//div[@id='wrapper']/h1/span/text()")[0]                            # title
    director_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/span[1]/a/text()")[0]   # author
    publish_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/a[1]/text()")[0]         # publisher
    # Not every book lists a producer; fall back to "0" so the script does not crash
    producer = tree1.xpath("//div[@class='subject clearfix']/div[2]/a[2]/text()")
    producer_list = producer[0] if producer else "0"
    # Pick the text node that follows the label span: /following-sibling::text()[1]
    year_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/span[@class='pl' "
                            "and text()='出版年:']/following-sibling::text()[1]")[0].strip()
    page_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/span[@class='pl' "
                            "and text()='页数:']/following-sibling::text()[1]")[0].strip()
    price_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/span[@class='pl' "
                             "and text()='定价:']/following-sibling::text()[1]")[0].strip()
    pool_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/span[@class='pl' "
                            "and text()='装帧:']/following-sibling::text()[1]")[0].strip()   # binding
    ISBN_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/span[@class='pl' "
                            "and text()='ISBN:']/following-sibling::text()[1]")[0].strip()
    series_list = tree1.xpath("//div[@class='subject clearfix']/div[2]/a[3]/text()")[0]      # series

    print(book_list, director_list, publish_list, year_list, page_list, price_list, pool_list, series_list, ISBN_list)

    fp.write(
        book_list + "," + director_list + "," + publish_list + "," + year_list + "," + page_list + ","
        + price_list + "," + pool_list + "," + series_list + "," + ISBN_list + "\n")

    # break
fp.close()
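Every labeled field on the detail page follows the same pattern: a span with class "pl" holding the label, followed by a text node with the value. That pattern can be wrapped in one helper that returns a default when the field is missing, which also avoids the IndexError you would otherwise get on a book without, say, a 页数 entry. This is a sketch built on the same XPaths as above; the get_field() name and the default value are my own additions, not from the original code.

```python
def get_field(tree, label, default=""):
    # Text node immediately after the <span class="pl">label</span> element
    nodes = tree.xpath(
        "//div[@class='subject clearfix']/div[2]/span[@class='pl' "
        "and text()='{}']/following-sibling::text()[1]".format(label)
    )
    return nodes[0].strip() if nodes else default

# Usage inside the loop above:
# year_list  = get_field(tree1, "出版年:")
# page_list  = get_field(tree1, "页数:")
# price_list = get_field(tree1, "定价:")
# ISBN_list  = get_field(tree1, "ISBN:")
```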
3: A rough crawler skeleton for larger-scale scraping

When scraping data at a larger scale you need to watch out for anti-crawling measures, otherwise your IP may get blocked. Below is a rough crawler framework that you can adapt and optimize further.

```python
import requests
from bs4 import BeautifulSoup
import time

# Request headers that mimic a normal browser visit
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}

# Scrape the book entries from one listing page
def parse_one_page(url):
    book_info_list = []
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    book_list = soup.find_all('li', attrs={'class': 'subject-item'})
    for book in book_list:
        # Parse the title, rating and comment count
        book_name = book.find('div', attrs={'class': 'info'}).find('a').get_text().strip()
        rating_num = book.find('div', attrs={'class': 'star clearfix'}).find(
            'span', attrs={'class': 'rating_nums'}).get_text().strip()
        comment_num = book.find('div', attrs={'class': 'star clearfix'}).find(
            'span', attrs={'class': 'pl'}).get_text().strip().replace('(', '').replace(')', '')
        book_info_list.append((book_name, rating_num, comment_num))
    return book_info_list

# Main function that controls how many pages get scraped
def main():
    book_info_total = []
    for i in range(0, 20):
        url = 'https://book.douban.com/top250?start={}'.format(i * 25)
        book_info = parse_one_page(url)
        book_info_total.extend(book_info)
        time.sleep(1)  # Wait one second between pages to avoid getting the IP banned

    # Save the results to a file
    with open('book_info.txt', 'w') as f:
        for book in book_info_total:
            f.write('{}\t{}\t{}\n'.format(book[0], book[1], book[2]))

if __name__ == '__main__':
    main()
```

This code scrapes the title, rating and comment count, 25 books per page, for 20 pages; adjust the range in main() to scrape more or fewer pages. Keep in mind that when scraping a lot of data, requests may fail or the program may hang because of network issues, so proper exception handling and a retry mechanism should be added.
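For the exception handling and retry mechanism mentioned above, requests can be combined with urllib3's Retry through an HTTPAdapter. The sketch below shows one way to do it; the retry count, backoff factor and timeout are illustrative values, not something prescribed by the original post.

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Session that retries failed requests with exponential backoff
session = requests.Session()
retry = Retry(total=3, backoff_factor=1,
              status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retry))

def fetch(url, headers):
    # Return the page HTML, or None if the request still fails after retries
    try:
        resp = session.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException as exc:
        print("request failed:", url, exc)
        return None
```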