Python 小笔记 (5) 爬虫起步(小说网站)

最新推荐文章于 2024-05-02 22:20:51 发布

c_xiaogishou

最新推荐文章于 2024-05-02 22:20:51 发布

阅读量368

点赞数 1

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/c_xiaogishou/article/details/106752334

版权

python 专栏收录该内容

6 篇文章

订阅专栏

爬取小说网页步骤

**——爬取小说并写入txt文件**

1.使用到的库

requests：用于get请求
bs4：网页解析
re：正则表达式

2.对网页文件结构进行分析（谷歌浏览器）
       根据红框内网址http://biquge.com.cn/book/8012/,可以推出每本书都有唯一ID，例如这里的8012，可通过修改网址中ID部分到达不同小说主页；
在这里插入图片描述
3.获取网页的请求头文件
       打开谷歌的开发者工具，选择Network，点击Name框下列表的第一个选项，如果没有对应的列表信息，刷新一下网页即可；

       将其中 Request Headers 选项下的各项内容整理成 Python 字典（即下文代码中的 req_header）；
在这里插入图片描述
4.分析每章小说网页结构
从开发者工具中找到你想要的网页元素，这里我们主要需要：

bookname：可获取章节名称
content：可获取章节内容
bottom2：可获取下一章节地址

在这里插入图片描述
5.具体代码

#coding:utf-8
import  requests
import threading
from bs4 import BeautifulSoup
import re
import os
import time
import sys

# Request headers copied from Chrome DevTools (Network tab -> Request Headers)
# while browsing the novel site. Header names use hyphens throughout; the
# "Sec-Fetch-*" names were previously misspelled with underscores.
req_header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'br' is intentionally not advertised: requests only decodes Brotli
    # responses when an optional Brotli package is installed.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # NOTE(review): these look like analytics cookies captured from one
    # browser session; presumably optional -- confirm before relying on them.
    'Cookie': 'UM_distinctid=172ae87b8a378-08226b3f44fef7-f7d123e-144000-172ae87b8a43a9; CNZZDATA1264388021=1696797693-1592062375-https%253A%252F%252Fwww.google.com.hk%252F%7C1592062375; Hm_lvt_79146f7516f35fe12fd594789a89d25d=1592066030; Hm_lpvt_79146f7516f35fe12fd594789a89d25d=1592066084',
    'Host': 'www.biquge.com.cn',
    'Referer': 'https://www.biquge.com.cn/book/',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
}


# Base URL of the book pages; a book's index page is req_url_base + '<id>/'.
req_url_base = 'https://www.biquge.com.cn/book/'
# req_url=req_url_base+'8012/'
# txt_section='320525.html'
# Novel download function and its private helpers.
def _fetch_soup(session, url):
    """Fetch *url* through *session* and return the parsed BeautifulSoup tree."""
    res = session.get(url, timeout=30)
    # Fail on 4xx/5xx here rather than with a confusing parse error later.
    res.raise_for_status()
    return BeautifulSoup(res.text, "html.parser")


def _select_first(soup, selector, url):
    """Return the first element matching *selector*.

    Raises IndexError (the same type an empty ``select(...)[0]`` raises),
    but with a message naming the selector and the page.
    """
    found = soup.select(selector)
    if not found:
        raise IndexError("no element matches {!r} on {}".format(selector, url))
    return found[0]


def get_txt(txt_id, out_path=None):
    """Download every chapter of one novel and write it to a UTF-8 text file.

    The book index page at ``req_url_base + '<txt_id>/'`` is fetched, the
    title, author line and introduction are read from it, and then each
    chapter linked from the index is fetched in order and appended to the
    output file as "chapter name" followed by the chapter text.

    Args:
        txt_id: Book number on the site (int or str), e.g. ``8012``.
        out_path: Path of the text file to write. When omitted, the file is
            named after the book title (``'<title>.txt'``) with characters
            that are invalid in file names replaced by ``_``.

    Returns:
        The path of the file that was written.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        IndexError: When an expected element is missing from a page.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urljoin

    book_id = str(txt_id)
    req_url = req_url_base + book_id + '/'  # index page of this book
    print("小说编号：" + book_id)

    # Send req_header as real HTTP headers (it used to be passed as query
    # parameters). Host and Accept-Encoding are left to requests so that
    # redirects keep a correct Host and only encodings requests can actually
    # decode are advertised.
    skipped = ('host', 'accept-encoding')
    headers = {k: v for k, v in req_header.items() if k.lower() not in skipped}

    with requests.Session() as session:
        session.headers.update(headers)

        index = _fetch_soup(session, req_url)
        info = '#wrapper .box_con #maininfo '
        title = _select_first(index, info + '#info h1', req_url).text
        author = _select_first(index, info + '#info p', req_url).text
        intro = _select_first(index, info + '#intro', req_url).text.strip()
        print("编号：" + '{0:0>8}   '.format(book_id) + "\n小说名：《" + title + "》" + "\n开始下载。")
        print("正在获取所有章节地址。。。")

        # Resolve each chapter href against the index URL; this works for
        # absolute-path and relative links alike, for any book id.
        anchors = index.select('#wrapper .box_con #list dl dd a')
        chapter_urls = [urljoin(req_url, a['href']) for a in anchors]
        print("小说章节数：" + str(len(chapter_urls)))

        if out_path is None:
            # Name the file after the book instead of one hard-coded title.
            safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
            out_path = (safe_title or book_id) + '.txt'

        chapter = '#wrapper .content_read .box_con '
        with open(out_path, 'w', encoding="utf-8") as file_object:
            file_object.write("小说编号：" + book_id + '\n')
            file_object.write("小说题目:" + title + '\n')
            file_object.write(author + '\n')
            file_object.write("小说简介:\n" + intro + '\n')
            for url in chapter_urls:
                soup = _fetch_soup(session, url)
                section_name = _select_first(soup, chapter + '.bookname h1', url).text
                content = _select_first(soup, chapter + '#content', url)
                for script in content.select("script"):  # drop embedded scripts
                    script.decompose()
                # One tab-indented paragraph per line. Plain '\n' is used so
                # text mode applies the platform line ending consistently.
                section_text = '\n'.join('\t' + p for p in content.text.split())
                file_object.write(section_name + '\n')
                file_object.write(section_text + '\n\n')
                print(section_name + "     下载完毕")

    print("全书下载完毕")
    return out_path

# Script entry point: download the book with the given number. The guard keeps
# an import of this module from starting a network download.
if __name__ == "__main__":
    get_txt(8012)