Scraping novels from a website (17k.com)

1. Log in
def login():
    """
    登录
    :return:
    """
    url = "https://passport.17k.com/ck/user/login"
    data = {
        "loginName": "18291962907",
        "password": "cjl2000321"
    }
    data = requests.post(url, data=data).json()
    if data['data']['nickname'] == "又大又白又圆又软":
        print("登录成功")
        return True
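
The snippet above posts the credentials but discards the cookies the server sets; the rest of the script relies on a hard-coded Cookie header instead. A minimal alternative sketch (the helper name login_with_session is mine; the endpoint and field names are the ones used above) keeps the login cookie on a requests.Session so later requests are authenticated automatically:

import requests

def login_with_session(login_name, password):
    """Sketch: log in and return a requests.Session that carries the login cookies."""
    session = requests.Session()
    url = "https://passport.17k.com/ck/user/login"
    resp = session.post(url, data={"loginName": login_name, "password": password}).json()
    nickname = (resp.get("data") or {}).get("nickname")
    if nickname:
        print("登录成功:", nickname)
        return session
    return None
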
2. Select completed, fully-free books and collect their names and URLs
def book_type():
    """
    获取书名与URL
    :return:
    """
    book_dict = {}
    url = 'https://www.17k.com/all/book/2_0_0_0_3_0_1_0_1.html'
    res = requests.get(url).text
    for value in re.findall('<a target="_blank" href="//www.17k.com/book/(.*?)</a>', res):
        data = value.split('" >')
        book_dict[data[1]] = data[0]
    return book_dict


# Apply the "completed" and "free only" filters
def screen():
    """
    Apply the "completed" and "free only" filters.
    :return: the result of book_type()
    """
    url_li = {"已完本": "https://static.17k.com/js/mod/WXShare.js?v=0911",
              "只看免费": "https://static.17k.com/js/mod/WXShare.js?v=0911"}
    param = {
        "v": "0911"
    }
    for key, url in url_li.items():
        data = requests.get(url, params=param).text
        if "设置成功" in data:
            print(key, "设置成功")
    return book_type()
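
One caveat: the two URLs in screen() point at a static WXShare.js file, so the response can never contain "设置成功"; the "completed" and "free only" filters are presumably encoded in the listing URL (2_0_0_0_3_0_1_0_1.html) that book_type() already requests. To sanity-check what comes back, a small usage sketch (assuming the functions above are defined in the same file) prints the first few name/URL pairs:

if __name__ == "__main__":
    books = screen()  # dict built by book_type(): book name -> URL tail
    print(len(books), "books found")
    for name, tail in list(books.items())[:5]:
        # the tail is everything after https://www.17k.com/book/
        print(name, "->", "https://www.17k.com/book/" + tail)
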
3. Open each book's page and get the link to its chapter-list page
def get_book_id(li):
    """
    进入书籍页,获取详情页url地址
    :param li:
    :return:
    """
    url = f"https://www.17k.com/book/{li}"
    res = requests.get(url)
    res.encoding = 'utf-8'
    for book_detail_url in re.findall(r'href="(.*?)"', res.text):
        if len(book_detail_url) == 18 and "list" in book_detail_url:
            return book_detail_url
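
The len(book_detail_url) == 18 test is a fragile heuristic. Assuming the chapter-list link has the shape /list/<id>.html (an assumption consistent with the "list" substring the code already looks for), a targeted regex avoids the magic number; a sketch (get_book_id_v2 is a hypothetical name):

def get_book_id_v2(li):
    """Sketch: find the chapter-list link with a targeted pattern instead of len() == 18."""
    url = f"https://www.17k.com/book/{li}"
    res = requests.get(url)
    res.encoding = 'utf-8'
    match = re.search(r'href="(/list/\d+\.html)"', res.text)
    return match.group(1) if match else None
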
4. Open the book's detail page and get every chapter's name and URL
def get_section_url(book_detail_url):
    """
    获取章节链接
    :param book_detail_url:
    :param url_set:
    :return:
    """
    chapter_dict = {}  # 存放章节名称和URL
    url = f"https://www.17k.com{book_detail_url}"
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    chapter_name = re.findall('title="(.*?)&', res.text)  # chapter names
    chapter_url = re.findall('href="/chapter(.*?)"', res.text)  # chapter URLs
    for index, value in enumerate(chapter_name):
        chapter_dict[value] = chapter_url[index]
    return chapter_dict
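
The two findall() calls are paired by index, so if the page ever yields a different number of titles and links the chapters silently shift. A defensive sketch (get_section_url_v2 is a hypothetical name) zips the two lists, dropping any unmatched extras instead of mis-pairing them:

def get_section_url_v2(book_detail_url):
    """Sketch: pair chapter names and URLs with zip() instead of parallel indexing."""
    url = f"https://www.17k.com{book_detail_url}"
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    names = re.findall('title="(.*?)&', res.text)
    urls = re.findall('href="/chapter(.*?)"', res.text)
    return dict(zip(names, urls))
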
5. Download the books

Downloading asynchronously is fast; a whole book finishes in seconds.

async def fetch(chapter_dict, book_path):
    """Download every chapter in chapter_dict concurrently into book_path."""
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(download_book(session, name, url, book_path)) for name, url in chapter_dict.items()]
        await asyncio.wait(tasks)


async def download_book(session, name, url, book_path):
    """Fetch one chapter page and write its title and paragraphs to a .txt file."""
    url = f"https://www.17k.com/chapter{url}"
    async with session.get(url, headers=headers) as res:
        # aiohttp responses take the encoding as an argument to text()
        data = await res.text(encoding='utf-8')
        paragraphs = re.findall(r'<div class="p">(.*?)</div>', data, re.DOTALL)     # chapter body
        file = os.path.join(book_path, name)
        with open(f"{file}.txt", mode="w", encoding="utf-8") as fp:
            fp.write(re.findall('<h1>(.*?)</h1>', data)[0] + "\n")  # chapter title
            fp.write("\n".join(re.findall('<p>(.*?)</p>', paragraphs[0].strip())))
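
Two practical caveats with the downloader: chapter titles can contain characters that are illegal in file names, and spawning one request per chapter with no cap hammers the site. A sketch of both fixes (safe_filename and fetch_limited are hypothetical helpers; download_book is the coroutine above), using an asyncio.Semaphore to cap concurrency:

import asyncio
import re

import aiohttp


def safe_filename(name):
    """Replace characters that are not allowed in Windows/macOS file names."""
    return re.sub(r'[\\/:*?"<>|]', "_", name).strip()


async def fetch_limited(chapter_dict, book_path, limit=10):
    """Sketch: like fetch(), but at most `limit` chapter downloads run at once."""
    sem = asyncio.Semaphore(limit)

    async def guarded(session, name, url):
        async with sem:
            await download_book(session, safe_filename(name), url, book_path)

    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(guarded(session, n, u) for n, u in chapter_dict.items()))
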
6. Complete code
"""
#!/usr/bin/env python
# -*- coding:utf-8 -*-
@Project : 图书信息爬取2.py
@File : 17k小说.py
@Author : 18291962907
@Time : 2023/10/23 0:21
"""
import requests, re, os, asyncio, aiohttp

excel_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "17K小说")
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61",
    "Cookie": "GUID=b42b4087-dc18-4711-854b-ed1509d29600; c_channel=0; c_csc=web; Hm_lvt_9793f42b498361373512340937deb2a0=1697815882,1697991226,1698066050; accessToken=avatarUrl%3Dhttps%253A%252F%252Fcdn.static.17k.com%252Fuser%252Favatar%252F03%252F43%252F67%252F102216743.jpg-88x88%253Fv%253D1697991806000%26id%3D102216743%26nickname%3D%25E5%258F%2588%25E5%25A4%25A7%25E5%258F%2588%25E7%2599%25BD%25E5%258F%2588%25E5%259C%2586%25E5%258F%2588%25E8%25BD%25AF%26e%3D1713618111%26s%3D8d33aa31493ead9d; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22102216743%22%2C%22%24device_id%22%3A%2218b4db665e33c3-0415888f7d54dd-745d5774-2073600-18b4db665e4e62%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%2C%22first_id%22%3A%22b42b4087-dc18-4711-854b-ed1509d29600%22%7D; Hm_lpvt_9793f42b498361373512340937deb2a0=1698071298"
}


def lord_file():
    """Create the root output folder if it does not exist yet."""
    if not os.path.exists(excel_path):
        os.makedirs(excel_path)
    else:
        print("已存在")


def book_file(book_name):
    """Create a folder for the book and return its path."""
    book_path = os.path.join(excel_path, book_name)
    os.makedirs(book_path, exist_ok=True)  # don't fail if the folder already exists
    print(book_name, "创建成功")
    return book_path


# Log in
def login():
    """
    Log in with the account credentials.
    :return: True if the returned nickname matches the account
    """
    url = "https://passport.17k.com/ck/user/login"
    data = {
        "loginName": "18291962907",
        "password": "cjl2000321"
    }
    data = requests.post(url, data=data).json()
    if data['data']['nickname'] == "又大又白又圆又软":
        print("登录成功")
        return True



# Collect book names and URLs
def book_type():
    """
    Collect book names and their URLs from the listing page.
    :return: dict mapping book name -> book URL tail
    """
    book_dict = {}
    url = 'https://www.17k.com/all/book/2_0_0_0_3_0_1_0_1.html'
    res = requests.get(url).text
    for value in re.findall('<a target="_blank" href="//www.17k.com/book/(.*?)</a>', res):
        data = value.split('" >')
        book_dict[data[1]] = data[0]
    return book_dict


# Apply the "completed" and "free only" filters
def screen():
    """
    Apply the "completed" and "free only" filters.
    :return: the result of book_type()
    """
    url_li = {"已完本": "https://static.17k.com/js/mod/WXShare.js?v=0911",
              "只看免费": "https://static.17k.com/js/mod/WXShare.js?v=0911"}
    param = {
        "v": "0911"
    }
    for key, url in url_li.items():
        data = requests.get(url, params=param).text
        if "设置成功" in data:
            print(key, "设置成功")
    return book_type()


# Open a book's page and get the chapter-list URL
def get_book_id(li):
    """
    Open a book's page and extract the URL of its chapter-list (detail) page.
    :param li: tail of the book URL collected by book_type()
    :return: the chapter-list URL, or None if no link matches
    """
    url = f"https://www.17k.com/book/{li}"
    res = requests.get(url)
    res.encoding = 'utf-8'
    for book_detail_url in re.findall(r'href="(.*?)"', res.text):
        if len(book_detail_url) == 18 and "list" in book_detail_url:
            return book_detail_url


# Collect chapter names and chapter URLs
def get_section_url(book_detail_url):
    """
    Collect chapter names and their URLs from the chapter-list page.
    :param book_detail_url: path of the chapter-list page
    :return: dict mapping chapter name -> chapter URL tail
    """
    chapter_dict = {}  # chapter name -> URL
    url = f"https://www.17k.com{book_detail_url}"
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    chapter_name = re.findall('title="(.*?)&', res.text)  # chapter names
    chapter_url = re.findall('href="/chapter(.*?)"', res.text)  # chapter URLs
    for index, value in enumerate(chapter_name):
        chapter_dict[value] = chapter_url[index]
    return chapter_dict


async def fetch(chapter_dict, book_path):
    """Download every chapter in chapter_dict concurrently into book_path."""
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(download_book(session, name, url, book_path)) for name, url in chapter_dict.items()]
        await asyncio.wait(tasks)


async def download_book(session, name, url, book_path):
    """Fetch one chapter page and write its title and paragraphs to a .txt file."""
    url = f"https://www.17k.com/chapter{url}"
    async with session.get(url, headers=headers) as res:
        # aiohttp responses take the encoding as an argument to text()
        data = await res.text(encoding='utf-8')
        paragraphs = re.findall(r'<div class="p">(.*?)</div>', data, re.DOTALL)     # chapter body
        file = os.path.join(book_path, name)
        with open(f"{file}.txt", mode="w", encoding="utf-8") as fp:
            fp.write(re.findall('<h1>(.*?)</h1>', data)[0] + "\n")  # chapter title
            fp.write("\n".join(re.findall('<p>(.*?)</p>', paragraphs[0].strip())))


# Entry point
def run():
    lord_file()
    # 1. Log in
    login()
    # 2. On the listing page, select completed, fully-free books; get a dict of book name -> URL
    data = screen()
    # 3. Loop over the book name -> URL dict
    for book_name, book_url in data.items():
        # 4. Create a folder named after the book (if missing) and get its path
        book_path = book_file(book_name)
        # 5. Open the book's intro page and get the chapter-list (detail) page URL
        book_detail_url = get_book_id(book_url)
        # 6. If it cannot be found, skip this book
        if not book_detail_url:
            continue
        # 7. Collect chapter names and chapter URLs from the detail page
        chapter_dict = get_section_url(book_detail_url)
        # 8. Download the chapters asynchronously
        asyncio.run(fetch(chapter_dict, book_path))




if __name__ == '__main__':
    run()
