Scraping novels from a website (17k.com)

1. Log in
def login():
    """
    登录
    :return:
    """
    url = "https://passport.17k.com/ck/user/login"
    data = {
        "loginName": "18291962907",
        "password": "cjl2000321"
    }
    data = requests.post(url, data=data).json()
    if data['data']['nickname'] == "又大又白又圆又软":
        print("登录成功")
        return True
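
The snippet above posts the credentials but discards the cookies the server sets; the rest of the script relies on a hard-coded Cookie header instead. A minimal alternative sketch (the helper name login_with_session is mine; the endpoint and field names are the ones used above) keeps the login cookie on a requests.Session so later requests are authenticated automatically:

import requests

def login_with_session(login_name, password):
    """Sketch: log in and return a requests.Session that carries the login cookies."""
    session = requests.Session()
    url = "https://passport.17k.com/ck/user/login"
    resp = session.post(url, data={"loginName": login_name, "password": password}).json()
    nickname = (resp.get("data") or {}).get("nickname")
    if nickname:
        print("登录成功:", nickname)
        return session
    return None
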
2. Select completed, fully-free books and collect their names and URLs
def book_type():
    """
    获取书名与URL
    :return:
    """
    book_dict = {}
    url = 'https://www.17k.com/all/book/2_0_0_0_3_0_1_0_1.html'
    res = requests.get(url).text
    for value in re.findall('<a target="_blank" href="//www.17k.com/book/(.*?)</a>', res):
        data = value.split('" >')
        book_dict[data[1]] = data[0]
    return book_dict


# Apply the "completed" and "free only" filters
def screen():
    """
    Apply the "completed" and "free only" filters.
    :return: the result of book_type()
    """
    url_li = {"已完本": "https://static.17k.com/js/mod/WXShare.js?v=0911",
              "只看免费": "https://static.17k.com/js/mod/WXShare.js?v=0911"}
    param = {
        "v": "0911"
    }
    for key, url in url_li.items():
        data = requests.get(url, params=param).text
        if "设置成功" in data:
            print(key, "设置成功")
    return book_type()
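
One caveat: the two URLs in screen() point at a static WXShare.js file, so the response can never contain "设置成功"; the "completed" and "free only" filters are presumably encoded in the listing URL (2_0_0_0_3_0_1_0_1.html) that book_type() already requests. To sanity-check what comes back, a small usage sketch (assuming the functions above are defined in the same file) prints the first few name/URL pairs:

if __name__ == "__main__":
    books = screen()  # dict built by book_type(): book name -> URL tail
    print(len(books), "books found")
    for name, tail in list(books.items())[:5]:
        # the tail is everything after https://www.17k.com/book/
        print(name, "->", "https://www.17k.com/book/" + tail)
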
3. Open each book's page and get the link to its chapter-list page
def get_book_id(li):
    """
    进入书籍页,获取详情页url地址
    :param li:
    :return:
    """
    url = f"https://www.17k.com/book/{li}"
    res = requests.get(url)
    res.encoding = 'utf-8'
    for book_detail_url in re.findall(r'href="(.*?)"', res.text):
        if len(book_detail_url) == 18 and "list" in book_detail_url:
            return book_detail_url
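
The len(book_detail_url) == 18 test is a fragile heuristic. Assuming the chapter-list link has the shape /list/<id>.html (an assumption consistent with the "list" substring the code already looks for), a targeted regex avoids the magic number; a sketch (get_book_id_v2 is a hypothetical name):

def get_book_id_v2(li):
    """Sketch: find the chapter-list link with a targeted pattern instead of len() == 18."""
    url = f"https://www.17k.com/book/{li}"
    res = requests.get(url)
    res.encoding = 'utf-8'
    match = re.search(r'href="(/list/\d+\.html)"', res.text)
    return match.group(1) if match else None
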
4. Open the book's detail page and get every chapter's name and URL
def get_section_url(book_detail_url):
    """
    获取章节链接
    :param book_detail_url:
    :param url_set:
    :return:
    """
    chapter_dict = {}  # 存放章节名称和URL
    url = f"https://www.17k.com{book_detail_url}"
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    chapter_name = re.findall('title="(.*?)&', res.text)  # chapter names
    chapter_url = re.findall('href="/chapter(.*?)"', res.text)  # chapter URLs
    for index, value in enumerate(chapter_name):
        chapter_dict[value] = chapter_url[index]
    return chapter_dict
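
The two findall() calls are paired by index, so if the page ever yields a different number of titles and links the chapters silently shift. A defensive sketch (get_section_url_v2 is a hypothetical name) zips the two lists, dropping any unmatched extras instead of mis-pairing them:

def get_section_url_v2(book_detail_url):
    """Sketch: pair chapter names and URLs with zip() instead of parallel indexing."""
    url = f"https://www.17k.com{book_detail_url}"
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    names = re.findall('title="(.*?)&', res.text)
    urls = re.findall('href="/chapter(.*?)"', res.text)
    return dict(zip(names, urls))
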
5. Download the books

Downloading asynchronously is fast; a whole book finishes in seconds.

async def fetch(chapter_dict, book_path):
    """Download every chapter in chapter_dict concurrently into book_path."""
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(download_book(session, name, url, book_path)) for name, url in chapter_dict.items()]
        await asyncio.wait(tasks)


async def download_book(session, name, url, book_path):
    """Fetch one chapter page and write its title and paragraphs to a .txt file."""
    url = f"https://www.17k.com/chapter{url}"
    async with session.get(url, headers=headers) as res:
        # aiohttp responses take the encoding as an argument to text()
        data = await res.text(encoding='utf-8')
        paragraphs = re.findall(r'<div class="p">(.*?)</div>', data, re.DOTALL)     # chapter body
        file = os.path.join(book_path, name)
        with open(f"{file}.txt", mode="w", encoding="utf-8") as fp:
            fp.write(re.findall('<h1>(.*?)</h1>', data)[0] + "\n")  # chapter title
            fp.write("\n".join(re.findall('<p>(.*?)</p>', paragraphs[0].strip())))
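
Two practical caveats with the downloader: chapter titles can contain characters that are illegal in file names, and spawning one request per chapter with no cap hammers the site. A sketch of both fixes (safe_filename and fetch_limited are hypothetical helpers; download_book is the coroutine above), using an asyncio.Semaphore to cap concurrency:

import asyncio
import re

import aiohttp


def safe_filename(name):
    """Replace characters that are not allowed in Windows/macOS file names."""
    return re.sub(r'[\\/:*?"<>|]', "_", name).strip()


async def fetch_limited(chapter_dict, book_path, limit=10):
    """Sketch: like fetch(), but at most `limit` chapter downloads run at once."""
    sem = asyncio.Semaphore(limit)

    async def guarded(session, name, url):
        async with sem:
            await download_book(session, safe_filename(name), url, book_path)

    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(guarded(session, n, u) for n, u in chapter_dict.items()))
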
6. Complete code
"""
#!/usr/bin/env python
# -*- coding:utf-8 -*-
@Project : 图书信息爬取2.py
@File : 17k小说.py
@Author : 18291962907
@Time : 2023/10/23 0:21
"""
import requests, re, os, asyncio, aiohttp

excel_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "17K小说")
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61",
    "Cookie": "GUID=b42b4087-dc18-4711-854b-ed1509d29600; c_channel=0; c_csc=web; Hm_lvt_9793f42b498361373512340937deb2a0=1697815882,1697991226,1698066050; accessToken=avatarUrl%3Dhttps%253A%252F%252Fcdn.static.17k.com%252Fuser%252Favatar%252F03%252F43%252F67%252F102216743.jpg-88x88%253Fv%253D1697991806000%26id%3D102216743%26nickname%3D%25E5%258F%2588%25E5%25A4%25A7%25E5%258F%2588%25E7%2599%25BD%25E5%258F%2588%25E5%259C%2586%25E5%258F%2588%25E8%25BD%25AF%26e%3D1713618111%26s%3D8d33aa31493ead9d; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22102216743%22%2C%22%24device_id%22%3A%2218b4db665e33c3-0415888f7d54dd-745d5774-2073600-18b4db665e4e62%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%2C%22first_id%22%3A%22b42b4087-dc18-4711-854b-ed1509d29600%22%7D; Hm_lpvt_9793f42b498361373512340937deb2a0=1698071298"
}


def lord_file():
    """Create the root output folder if it does not exist yet."""
    if not os.path.exists(excel_path):
        os.makedirs(excel_path)
    else:
        print("已存在")


def book_file(book_name):
    """Create a folder for the book and return its path."""
    book_path = os.path.join(excel_path, book_name)
    os.makedirs(book_path, exist_ok=True)  # don't fail if the folder already exists
    print(book_name, "创建成功")
    return book_path


# Log in
def login():
    """
    Log in with the account credentials.
    :return: True if the returned nickname matches the account
    """
    url = "https://passport.17k.com/ck/user/login"
    data = {
        "loginName": "18291962907",
        "password": "cjl2000321"
    }
    data = requests.post(url, data=data).json()
    if data['data']['nickname'] == "又大又白又圆又软":
        print("登录成功")
        return True



# Collect book names and URLs
def book_type():
    """
    Collect book names and their URLs from the listing page.
    :return: dict mapping book name -> book URL tail
    """
    book_dict = {}
    url = 'https://www.17k.com/all/book/2_0_0_0_3_0_1_0_1.html'
    res = requests.get(url).text
    for value in re.findall('<a target="_blank" href="//www.17k.com/book/(.*?)</a>', res):
        data = value.split('" >')
        book_dict[data[1]] = data[0]
    return book_dict


# Apply the "completed" and "free only" filters
def screen():
    """
    Apply the "completed" and "free only" filters.
    :return: the result of book_type()
    """
    url_li = {"已完本": "https://static.17k.com/js/mod/WXShare.js?v=0911",
              "只看免费": "https://static.17k.com/js/mod/WXShare.js?v=0911"}
    param = {
        "v": "0911"
    }
    for key, url in url_li.items():
        data = requests.get(url, params=param).text
        if "设置成功" in data:
            print(key, "设置成功")
    return book_type()


# Open a book's page and get the chapter-list URL
def get_book_id(li):
    """
    Open a book's page and extract the URL of its chapter-list (detail) page.
    :param li: tail of the book URL collected by book_type()
    :return: the chapter-list URL, or None if no link matches
    """
    url = f"https://www.17k.com/book/{li}"
    res = requests.get(url)
    res.encoding = 'utf-8'
    for book_detail_url in re.findall(r'href="(.*?)"', res.text):
        if len(book_detail_url) == 18 and "list" in book_detail_url:
            return book_detail_url


# Collect chapter names and chapter URLs
def get_section_url(book_detail_url):
    """
    Collect chapter names and their URLs from the chapter-list page.
    :param book_detail_url: path of the chapter-list page
    :return: dict mapping chapter name -> chapter URL tail
    """
    chapter_dict = {}  # chapter name -> URL
    url = f"https://www.17k.com{book_detail_url}"
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    chapter_name = re.findall('title="(.*?)&', res.text)  # chapter names
    chapter_url = re.findall('href="/chapter(.*?)"', res.text)  # chapter URLs
    for index, value in enumerate(chapter_name):
        chapter_dict[value] = chapter_url[index]
    return chapter_dict


async def fetch(chapter_dict, book_path):
    """Download every chapter in chapter_dict concurrently into book_path."""
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(download_book(session, name, url, book_path)) for name, url in chapter_dict.items()]
        await asyncio.wait(tasks)


async def download_book(session, name, url, book_path):
    """Fetch one chapter page and write its title and paragraphs to a .txt file."""
    url = f"https://www.17k.com/chapter{url}"
    async with session.get(url, headers=headers) as res:
        # aiohttp responses take the encoding as an argument to text()
        data = await res.text(encoding='utf-8')
        paragraphs = re.findall(r'<div class="p">(.*?)</div>', data, re.DOTALL)     # chapter body
        file = os.path.join(book_path, name)
        with open(f"{file}.txt", mode="w", encoding="utf-8") as fp:
            fp.write(re.findall('<h1>(.*?)</h1>', data)[0] + "\n")  # chapter title
            fp.write("\n".join(re.findall('<p>(.*?)</p>', paragraphs[0].strip())))


# Entry point
def run():
    lord_file()
    # 1. Log in
    login()
    # 2. On the listing page, select completed, fully-free books; get a dict of book name -> URL
    data = screen()
    # 3. Loop over the book name -> URL dict
    for book_name, book_url in data.items():
        # 4. Create a folder named after the book (if missing) and get its path
        book_path = book_file(book_name)
        # 5. Open the book's intro page and get the chapter-list (detail) page URL
        book_detail_url = get_book_id(book_url)
        # 6. If it cannot be found, skip this book
        if not book_detail_url:
            continue
        # 7. Collect chapter names and chapter URLs from the detail page
        chapter_dict = get_section_url(book_detail_url)
        # 8. Download the chapters asynchronously
        asyncio.run(fetch(chapter_dict, book_path))




if __name__ == '__main__':
    run()
