诗词名句网爬取四大名著

诗词名句网:

URL = https://www.shicimingju.com/book/index.html

1、获取页面Html内容

def get_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    res = requests.get(url=url, headers=headers)
    res.encoding = res.apparent_encoding
    # print(res.text)

    soup = BeautifulSoup(res.text, "lxml")
    return soup

通过bs4获取到页面内容,可以从HTML中提取数据,所以将获取html封装成一个方法,返回一个soup对象

2、获取书名以及书的链接

分析四个书名都在div card book_card中,直接用soup对象找到对应位置

然后再找里面li,遍历出来后去里面的;

那么a.text就为书名。

而href即为base_url + (a.attrs['href'])

定义一个空字典。书名为key,href为value

最终返回书的字典

def get_book(soup):

    book_obj = soup.find(name="div", attrs={"class": "card booknark_card"})
    li_list = book_obj.findAll(name="li")

    book_dict = {}
    base_url = "https://www.shicimingju.com/"
    for li in li_list:
        # print(li)
        a = li.find(name="a")
        book_name = a.text
        book_url = base_url + (a.attrs['href'])
        book_dict[book_name] = book_url

    return book_dict

 返回字典后遍历一下:

 book_dict = get_book(get_html(url))
    # print(book_dict)
    for book, book_href in book_dict.items():
        print(book, book_href)

 3、获取书的目录及目录对应链接

第2步时拿到了书的链接,通过get_html方法获取页面元素,传入到get_mulu方法中

分析到目录都在class为"book-mulu"的div中。通过bs4模块找到对应内容,再存到一个名为mulu_dict的字典中,目录标题为key,目录链接为value,注意目录链接也需要拼接成完整的url;

返回该字典

def get_mulu(mulu_html):
    mulu_dict = {}
    book_mulu = mulu_html.find("div", class_="book-mulu")
    # print(book_mulu)
    mulu_list = book_mulu.find_all("a")
    for mulu in mulu_list:
        mulu_title = mulu.text
        # print(mulu_title)
        mulu_href = "https://www.shicimingju.com" + (mulu.attrs['href'])
        # print(mulu_href,mulu_title)
        mulu_dict[mulu_title] = mulu_href
    # print(mulu_dict)
    return mulu_dict

 4、获取章节内容

通过分析每个章节的内容都在class名为"chapter_content"的div中,通过bs4模块找到对应位置,

content_html为get_html(章节链接)获得的

接下来就是老样子,定义空字典,key存入章节标题。value存入章节内容

def get_mulu_content(mulu_title, content_html):
    content_dict = {}
    chapter_content = content_html.find("div", class_="chapter_content")
    content = chapter_content.text
    content_dict[mulu_title] = content
    # print(content_dict)
    return content_dict

5、下载保存

def save_book(book, content_title, content_dict):
    '''

    :param book:  书名,通过书名创建对应的文件夹存放章节
    :param content_title:  内容名称即章节名称,用来作为文件名
    :param content_dict:    内容字典,value为章节内容
    :return:
    '''
    if not os.path.exists(book):
        os.mkdir(book)

    path = os.path.join(book)

    with open(path + "/{}.txt".format(content_title), mode="w", encoding="utf-8") as f:
        f.write(content_dict[content_title])
        print(f"{book} === {content_title} 下载完成!!!!!!")

6、main函数

def main(url):
    book_dict = get_book(get_html(url))
    # print(book_dict)
    # 遍历出书名以及书的链接
    for book, book_href in book_dict.items():
        print(book, book_href)
        # 通过传入书url进入到get_html方法获得目录的html内容
        mulu_html = get_html(book_href)
        # 在通过get_mulu方法,传入mulu_html获得目录字典
        mulu_dict = get_mulu(mulu_html)
        # 遍历目录字典
        for mulu_title in mulu_dict:
            # 获取章节html
            content_html = get_html(mulu_dict[mulu_title])
            # 获取章节内容字典
            content_dict = get_mulu_content(mulu_title, content_html)
            # 遍历章节内容字典,调用下载保存方法保存
            for content_title in content_dict:
                save_book(book, content_title, content_dict)

完整代码

import requests
from bs4 import BeautifulSoup
import os


def get_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    res = requests.get(url=url, headers=headers)
    res.encoding = res.apparent_encoding
    # print(res.text)

    soup = BeautifulSoup(res.text, "lxml")
    return soup


# print(div_card)


def get_book(soup):
    # 获取
    book_obj = soup.find(name="div", attrs={"class": "card booknark_card"})
    li_list = book_obj.findAll(name="li")

    book_dict = {}
    base_url = "https://www.shicimingju.com/"
    for li in li_list:
        # print(li)
        a = li.find(name="a")
        book_name = a.text
        book_url = base_url + (a.attrs['href'])
        book_dict[book_name] = book_url

    return book_dict


def get_mulu(mulu_html):
    mulu_dict = {}
    book_mulu = mulu_html.find("div", class_="book-mulu")
    # print(book_mulu)
    mulu_list = book_mulu.find_all("a")
    for mulu in mulu_list:
        mulu_title = mulu.text
        # print(mulu_title)
        mulu_href = "https://www.shicimingju.com" + (mulu.attrs['href'])
        # print(mulu_href,mulu_title)
        mulu_dict[mulu_title] = mulu_href
    # print(mulu_dict)
    return mulu_dict


def get_mulu_content(mulu_title, content_html):
    content_dict = {}
    chapter_content = content_html.find("div", class_="chapter_content")
    content = chapter_content.text
    content_dict[mulu_title] = content
    # print(content_dict)
    return content_dict


def save_book(book, content_title, content_dict):
    '''

    :param book:  书名,通过书名创建对应的文件夹存放章节
    :param content_title:  内容名称即章节名称,用来作为文件名
    :param content_dict:    内容字典,value为章节内容
    :return:
    '''
    if not os.path.exists(book):
        os.mkdir(book)

    path = os.path.join(book)

    with open(path + "/{}.txt".format(content_title), mode="w", encoding="utf-8") as f:
        f.write(content_dict[content_title])
        print(f"{book} === {content_title} 下载完成!!!!!!")


def main(url):
    book_dict = get_book(get_html(url))
    # print(book_dict)
    # 遍历出书名以及书的链接
    for book, book_href in book_dict.items():
        print(book, book_href)
        # 通过传入书url进入到get_html方法获得目录的html内容
        mulu_html = get_html(book_href)
        # 在通过get_mulu方法,传入mulu_html获得目录字典
        mulu_dict = get_mulu(mulu_html)
        # 遍历目录字典
        for mulu_title in mulu_dict:
            # 获取章节html
            content_html = get_html(mulu_dict[mulu_title])
            # 获取章节内容字典
            content_dict = get_mulu_content(mulu_title, content_html)
            # 遍历章节内容字典,调用下载保存方法保存
            for content_title in content_dict:
                save_book(book, content_title, content_dict)


if __name__ == '__main__':
    url = "https://www.shicimingju.com/book/index.html"
    main(url)

  • 12
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值