Scraping Novels

This post is just a memo for myself.
The key to the whole job is BeautifulSoup: once you know how to use BeautifulSoup, the features below are easy to implement.
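
As a quick reminder of what that boils down to: BeautifulSoup parses the page once, and CSS selectors pull out the elements you need. A minimal sketch (the HTML snippet here is made up purely for illustration, it just mimics the structure the scrapers below expect):

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

# A tiny stand-in for a real chapter page.
html = '''
<div align="center"><h2>第一章 示例</h2></div>
<div id="content"><p>正文第一段<br/>正文第二段</p></div>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.select('div[align="center"] > h2')[0].get_text())   # chapter title
print(soup.select('#content > p')[0].get_text())               # chapter body text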

1. Parse the content of an HTML page into novel text and save it

# -*- coding: utf-8 -*-
import os
import re
import requests
from bs4 import BeautifulSoup

class getnovel:
    def __init__(self, url, path):
        self.url = url
        self.path = path

    def doget(self):
        req = requests.get(self.url)
        # The site serves GBK-encoded pages (the chapter-heading check below was
        # originally done on GBK bytes for 第/章), so decode explicitly instead of
        # round-tripping through ISO-8859-1.
        req.encoding = 'gbk'
        soup = BeautifulSoup(req.text, 'lxml')
        content = str(soup.select("#content > p")[0])

        # Strip whitespace entities, then turn <br> tags into newlines and drop
        # the wrapping <p> tags.
        pattern = re.compile(r'\n|&nbsp;|\xa0|\u3000|\u0020|\t|\r')
        clean_str = pattern.sub('', content)
        clean_str = (clean_str.replace('<br>', '\n').replace('<br/>', '\n')
                              .replace('<p>', '').replace('</p>', ''))

        # The title sits in a centered <h2>; use it as the output file name.
        title = soup.select('div[align="center"] > h2')[0].get_text().replace(' ', '') + '.txt'

        # Dump the cleaned text to a temporary file, then reassemble it line by line.
        with open('log.txt', 'w', encoding='utf-8') as f:
            f.write(clean_str)

        # Make sure the target directory exists before writing into it.
        if self.path:
            os.makedirs(self.path, exist_ok=True)

        with open('log.txt', 'r', encoding='utf-8') as fr:
            with open(self.path + title, 'w', encoding='utf-8') as fw:
                article = ""
                for s in fr.readlines():
                    # Lines containing 第…章 are chapter headings: surround them
                    # with blank lines; everything else is joined onto one line.
                    if s.find('第') >= 0 and s.find('章') >= 0:
                        article += "\n\n"
                        article += s
                        article += "\n"
                    elif s != "\n":
                        article += s.replace("\n", "")
                fw.write(article)

if __name__ == '__main__':
    url = 'http://www.二级域名.me/12/12126/972764.html'
    a = getnovel(url=url, path='')
    a.doget()

Running this file saves the novel locally as a txt file named after its title.

2. Scrape the URLs of all novels shown in the home-page list

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests

class getUrlFromAllvisit:
    def __init__(self, url):
        self.url = url

    def doget(self):
        req = requests.get(self.url)
        soup = BeautifulSoup(req.text, 'lxml')
        # Each novel on the ranking page sits in an "alistbox" block; grab the title link inside it.
        lianjie = soup.select('div[id="alist"] > div[id="alistbox"] > div[class="info"] > div[class="title"] > h2 > a')
        
        resList = []
        for s in lianjie:
            resList.append(s['href'])
        
        return resList

if __name__ == '__main__':
    url = "http://www.二级域名.me/modules/article/toplist.php?sort=allvisit&page=1"
    test = getUrlFromAllvisit(url)
    resList = test.doget()
    for s in resList:
        print(s)

3. Scrape the URLs of every chapter of a given novel

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests

class getUrlFromBookHTML:
    def __init__(self, url):
        self.url = url
    
    def doget(self):
        req = requests.get(self.url)
        soup = BeautifulSoup(req.text, 'lxml')
        # Chapter links live in <p class="ti"> elements inside the chapter list.
        lianjie = soup.select('div[class="con"] > p[class="ti"] > a')

        resList = []
        for s in lianjie:
            # hrefs on the book page are relative, so prefix the site's domain.
            resList.append('http://www.二级域名.me' + s['href'])
        
        return resList

if __name__ == '__main__':
    url = "http://www.二级域名.me/book/6571.html"
    test = getUrlFromBookHTML(url)
    resList = test.doget()
    for s in resList:
        print(s)

4. Test

# -*- coding: utf-8 -*-
from getUrlFromAllvisit import getUrlFromAllvisit
from getUrlFromBookHTML import getUrlFromBookHTML
from getnovel import getnovel

url = "http://www.二级域名.me/modules/article/toplist.php?sort=allvisit&page=1"

# 1. Collect every novel URL from the most-visited ranking page.
resList1 = getUrlFromAllvisit(url).doget()

# 2. Expand each novel page into its chapter URLs.
resList2 = []
for s in resList1:
    temp = getUrlFromBookHTML(s).doget()
    for ss in temp:
        resList2.append(ss)

# 3. Download every chapter into the novel/ directory.
for s in resList2:
    getnovel(s, 'novel/').doget()
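
One practical note on the loop above: it requests every chapter back to back. A short pause between downloads keeps the load on the site reasonable; a minimal variant of the last loop (the 1-second delay is an arbitrary choice, not something the site mandates):

import time

for s in resList2:
    getnovel(s, 'novel/').doget()
    # pause briefly between chapter downloads to avoid hammering the server
    time.sleep(1)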