This post is just a memo for my own reference.
The key piece during development is BeautifulSoup: once you know how to use it, the features below are easy to implement.
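As a warm-up, here is a minimal sketch of the two BeautifulSoup calls everything below relies on: parsing a document and picking nodes with select() CSS selectors. The toy HTML string is made up purely for illustration.

from bs4 import BeautifulSoup

html = '<div id="content"><p>Hello<br/>World</p></div>'  # toy HTML, for illustration only
soup = BeautifulSoup(html, 'lxml')
for p in soup.select('#content > p'):  # same selector style as the scripts below
    print(p.get_text())                # -> HelloWorld (<br/> contributes no text)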
1. Parse the content of an HTML page into novel text and save it
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import re


class getnovel:
    def __init__(self, url, path):
        self.url = url
        self.path = path

    def doget(self):
        req = requests.get(self.url)
        # The site serves GBK pages; let requests sniff the real encoding
        # instead of falling back to ISO-8859-1.
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'lxml')
        content = str(soup.select("#content > p")[0])
        # Drop whitespace artifacts (spaces, NBSP, ideographic space, tabs, CR, LF) ...
        pattern = re.compile(r'\n| |\xa0|\u3000|\t|\r')
        clean_str = pattern.sub('', content)
        # ... then turn <br> tags into real line breaks and strip the <p> wrapper.
        clean_str = clean_str.replace('<br>', '\n').replace('<br/>', '\n').replace('<p>', '').replace('</p>', '')
        title = soup.select('div[align="center"] > h2')[0].get_text().strip() + '.txt'
        article = ""
        for s in clean_str.split('\n'):
            # Chapter headings contain “第…章”; set them off with blank lines.
            # Ordinary lines are concatenated into one block per chapter.
            if '第' in s and '章' in s:
                article += "\n\n" + s + "\n"
            elif s != "":
                article += s
        with open(self.path + title, 'w', encoding='utf-8') as fw:
            fw.write(article)


if __name__ == '__main__':
    url = 'http://www.二级域名.me/12/12126/972764.html'
    a = getnovel(url=url, path='')
    a.doget()
Running this file saves the novel locally as a .txt file named after the novel.
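One detail worth calling out: the page content is GBK-encoded (the chapter markers 第/章 used to show up as the GBK bytes \xb5\xda and \xd5\xc2), and when the response headers declare no charset, requests decodes text responses as ISO-8859-1 by default. The fix used in doget() is to let requests sniff the real charset; a minimal sketch of that pattern, where url stands for any chapter page on the site:

import requests

url = 'http://www.二级域名.me/12/12126/972764.html'  # any chapter page
req = requests.get(url)
req.encoding = req.apparent_encoding  # sniff the real charset (GBK here)
text = req.text                       # now a properly decoded str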
2. Scrape the URLs of all novels shown in the front-page list
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests


class getUrlFromAllvisit:
    def __init__(self, url):
        self.url = url

    def doget(self):
        req = requests.get(self.url)
        soup = BeautifulSoup(req.text, 'lxml')
        # Each entry on the ranking page sits under
        # #alist > #alistbox > .info > .title > h2 > a
        lianjie = soup.select('div[id="alist"] > div[id="alistbox"] > div[class="info"] > div[class="title"] > h2 > a')
        resList = []
        for s in lianjie:
            resList.append(s['href'])
        return resList


if __name__ == '__main__':
    url = "http://www.二级域名.me/modules/article/toplist.php?sort=allvisit&page=1"
    test = getUrlFromAllvisit(url)
    resList = test.doget()
    for s in resList:
        print(s)
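The toplist URL takes a page query parameter, so the same class can be reused to walk several ranking pages. A hedged sketch (the page count of 5 is an arbitrary example, not something read from the site):

allUrls = []
for page in range(1, 6):  # 5 pages is an arbitrary example
    url = ('http://www.二级域名.me/modules/article/toplist.php'
           '?sort=allvisit&page=%d' % page)
    allUrls.extend(getUrlFromAllvisit(url).doget())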
3. Scrape the URLs of all chapters of a given novel
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests


class getUrlFromBookHTML:
    def __init__(self, url):
        self.url = url

    def doget(self):
        req = requests.get(self.url)
        soup = BeautifulSoup(req.text, 'lxml')
        # Chapter links live under .con > .ti > a and are site-relative,
        # so the domain is prefixed by hand.
        lianjie = soup.select('div[class="con"] > p[class="ti"] > a')
        resList = []
        for s in lianjie:
            resList.append('http://www.二级域名.me' + s['href'])
        return resList


if __name__ == '__main__':
    url = "http://www.二级域名.me/book/6571.html"
    test = getUrlFromBookHTML(url)
    resList = test.doget()
    for s in resList:
        print(s)
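Since the chapter hrefs are site-relative, the domain is concatenated by hand above. urllib.parse.urljoin does the same job and also copes with hrefs that are already absolute; a small sketch, using the chapter page from section 1 as the example href:

from urllib.parse import urljoin

base = 'http://www.二级域名.me/book/6571.html'
href = '/12/12126/972764.html'  # example relative href (the chapter page from section 1)
print(urljoin(base, href))      # -> http://www.二级域名.me/12/12126/972764.html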
4. Test
# -*- coding: utf-8 -*-
from getUrlFromAllvisit import getUrlFromAllvisit
from getUrlFromBookHTML import getUrlFromBookHTML
from getnovel import getnovel

# Collect every novel on page 1 of the ranking list ...
url = "http://www.二级域名.me/modules/article/toplist.php?sort=allvisit&page=1"
resList1 = getUrlFromAllvisit(url).doget()

# ... expand each novel into its chapter URLs ...
resList2 = []
for s in resList1:
    temp = getUrlFromBookHTML(s).doget()
    for ss in temp:
        resList2.append(ss)

# ... and download every chapter into the novel/ directory.
for s in resList2:
    getnovel(s, 'novel/').doget()
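Chained like this, a single failed chapter request aborts the whole run, and the requests fire back to back. A hedged variant of the final loop with a pause and basic error handling (the 1-second delay is an arbitrary choice):

import time

for s in resList2:
    try:
        getnovel(s, 'novel/').doget()
    except Exception as e:
        print('failed: %s (%s)' % (s, e))
    time.sleep(1)  # arbitrary pause to be gentle on the server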