Web crawler: scraping novels and assembling them into a readable file


Scraping novels keeps getting harder: sites impose more restrictions and page formats change constantly. Below are a few old scripts I have used; with some tweaks they can still be put to work.

# Scrape a novel from Biquge (biqugexx.com)
from requests.exceptions import RequestException
import re
import time
import requests

requests.adapters.DEFAULT_RETRIES = 5  # intended to raise the retry count (see the note after this script)
s = requests.session()
s.keep_alive = False  # intended to close stray connections (also see the note below)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

params = {
    'enc':'utf-8'
}

urlfirst ="https://www.biqugexx.com/0_427/" #目录页  格式 <p><a href="/139/139648/48828294.html">第1章 重返地球</a></p>
# response = get(urlfirst)
response = requests.get(url=urlfirst,params=params,headers=headers)

# 打印出所请求页面返回的编码方式
# print(response.encoding)
# response.apparent_encoding是通过内容分析出的编码方式,这里是urf-8
# print(response.apparent_encoding)
# 转码
content = response.text.encode(response.encoding).decode(response.apparent_encoding)
# print(content)

# print(response.text)
html = etree.HTML(response.text) #etree.HTML():构造了一个XPath解析对象并对HTML文本进行自动修正
k = ""

# s=" <dd><a href='/28/28056/22534186.html' >第一百二十八章 阳明公也是</a></dd>"
# p = "<dd><a href=\'/28/28056/[0-9]+.html\' >第"
# r=re.findall("<dd><a href=\'/28/28056/[0-9]+.html\' >第",s)

# pattern = "<dt>(.*?)</dt>"
# result = re.findall(pattern,content)
# print(result)
# exit()
# content = result

# pattern = "<dd><a href=\'(/28/28056/[0-9]+).html\' >(第[\u4e00-\u9fa5]+章 [\u4e00-\u9fa5]+)</a></dd>"
pattern = "<dd> <a style=\"\" href=\"/0_427/30909437.html\">第[0-9]+章</a></dd>"
# /28/28056/21593229.html' >第六十一章 死道友不死贫道
result = re.findall(pattern,content)  # 
# print(result)
for pp in result:

    time.sleep(3)  # too many requests in a row will get the IP banned

    k += "\n" + pp[1] + "\n"  # chapter title
    print(pp[1])
    chapter = "https://www.biqugexx.com" + pp[0] + ".html"
    print(chapter)
    try:
        response = s.get(url=chapter, params=params, headers=headers)
        response.encoding = response.apparent_encoding
        content = response.text
        body_pattern = r'<div id="content">&nbsp;&nbsp;&nbsp;&nbsp;(.*?)&nbsp;&nbsp;&nbsp;&nbsp;ps\.'
        body = re.findall(body_pattern, content)  # chapter text
        print(body)  # debug output

        for r in body:
            r = r.replace("&nbsp;", " ")
            r = r.replace("<br/>", "\n")
            k += r
    except RequestException as e:
        print(e)

with open("biquge.txt","w+",encoding="utf8")as f:
    f.write(k)
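
One caveat on the two connection tweaks at the top of this script: reassigning requests.adapters.DEFAULT_RETRIES after import generally has no effect, because HTTPAdapter binds that default when requests is first imported, and a Session object does not honor a keep_alive attribute either. If retries and connection handling actually matter, the documented route is to mount an HTTPAdapter configured with urllib3's Retry. A minimal sketch; the retry counts, backoff, and status list here are my own choices, not from the original:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry up to 5 times on connection errors and on typical throttling or
# server-error statuses, with exponential backoff between attempts.
retry = Retry(total=5, backoff_factor=0.5,
              status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retry))
session.mount("http://", HTTPAdapter(max_retries=retry))
# Sending 'Connection: close' is the dependable way to disable keep-alive.
session.headers.update({"Connection": "close"})

response = session.get("https://www.biqugexx.com/0_427/", timeout=10)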


# Scrape a novel from Bokan (bokan.cc); same flow as the script above
from requests.exceptions import RequestException
import re
import time
import requests

requests.adapters.DEFAULT_RETRIES = 5  # intended retry bump (see the note after the first script)
s = requests.session()
s.keep_alive = False

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

params = {
    'enc':'utf-8'
}

urlfirst ="https://www.bokan.cc/12/12620/" #目录页  格式 <p><a href="/139/139648/48828294.html">第1章 重返地球</a></p>
# response = get(urlfirst)
response = requests.get(url=urlfirst,params=params,headers=headers)

# 打印出所请求页面返回的编码方式
# print(response.encoding)
# response.apparent_encoding是通过内容分析出的编码方式,这里是urf-8
# print(response.apparent_encoding)
# 转码
content = response.text.encode(response.encoding).decode(response.apparent_encoding)
# print(content)

# print(response.text)
html = etree.HTML(response.text) #etree.HTML():构造了一个XPath解析对象并对HTML文本进行自动修正
k = ""

# s=" <dd><a href='/28/28056/22534186.html' >第一百二十八章 阳明公也是</a></dd>"
# p = "<dd><a href=\'/28/28056/[0-9]+.html\' >第"
# r=re.findall("<dd><a href=\'/28/28056/[0-9]+.html\' >第",s)

# pattern = "<dt>(.*?)</dt>"
# result = re.findall(pattern,content)
# print(result)
# exit()
# content = result

# pattern = "<dd><a href=\'(/28/28056/[0-9]+).html\' >(第[\u4e00-\u9fa5]+章 [\u4e00-\u9fa5]+)</a></dd>"
pattern = "<li><a href=\"/12/12620/10078180.html\">第[0-9]+章</a></li>"
# /28/28056/21593229.html' >第六十一章 死道友不死贫道
result = re.findall(pattern,content)  # 
# print(result)
for pp in result:

    time.sleep(3)  # too many requests in a row will get the IP banned

    k += "\n" + pp[1] + "\n"  # chapter title
    print(pp[1])
    chapter = "https://www.bokan.cc" + pp[0] + ".html"
    print(chapter)
    try:
        response = s.get(url=chapter, params=params, headers=headers)
        response.encoding = response.apparent_encoding
        content = response.text
        body_pattern = r'&nbsp;&nbsp;&nbsp;&nbsp;第(.*?)<div id="gt1"></div>'
        body = re.findall(body_pattern, content)  # chapter text
        print(body)  # debug output

        for r in body:
            r = r.replace("&nbsp;", " ")
            r = r.replace("<br/>", "\n")
            k += r
    except RequestException as e:
        print(e)

with open("biquge.txt","w+",encoding="utf8")as f:
    f.write(k)
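
The two scripts above differ only in the base URL, the table-of-contents pattern, the chapter-body pattern, and the output file, so they collapse naturally into a single parameterized function. A minimal sketch; the helper name scrape_novel and its signature are my own, not from the original:

import re
import time
import requests

def scrape_novel(base_url, toc_path, toc_pattern, body_pattern, out_path, delay=3):
    # toc_pattern must capture two groups: the relative chapter path
    # (without ".html") and the chapter title.
    session = requests.Session()
    session.headers['User-Agent'] = 'Mozilla/5.0'
    resp = session.get(base_url + toc_path)
    resp.encoding = resp.apparent_encoding
    chapters = re.findall(toc_pattern, resp.text)

    text = ""
    for path, title in chapters:
        time.sleep(delay)  # stay polite; rapid-fire requests get the IP banned
        page = session.get(base_url + path + ".html")
        page.encoding = page.apparent_encoding
        for body in re.findall(body_pattern, page.text):
            text += "\n" + title + "\n"
            text += body.replace("&nbsp;", " ").replace("<br/>", "\n")

    with open(out_path, "w", encoding="utf8") as f:
        f.write(text)

# Usage mirroring the first script:
scrape_novel(
    "https://www.biqugexx.com", "/0_427/",
    r'<dd> <a style="" href="(/0_427/[0-9]+)\.html">(第[0-9]+章[^<]*)</a></dd>',
    r'<div id="content">&nbsp;&nbsp;&nbsp;&nbsp;(.*?)&nbsp;&nbsp;&nbsp;&nbsp;ps\.',
    "biquge.txt",
)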


import requests
import re
import time

s = requests.Session()
# url = 'https://www.xxbiquge.com/2_2634/'  # earlier novel, kept for reference
url = 'https://www.xsbiquge.com/91_91879/'
html = s.get(url)
html.encoding = 'utf-8'

# Collect chapter links and chapter numbers from the table of contents
caption_title_1 = re.findall(r'<a href="(/91_91879/.*?\.html)".*?</a>', html.text)
caption_title_2 = re.findall(r'<a href="/91_91879/.*?\.html".*?第([0-9]+)章.*?</a>', html.text)  # e.g. <a href="/91_91879/640499.html">第1章</a>
print(len(caption_title_1), len(caption_title_2))
# Output file
path = r'title.txt'  # change this to wherever you want the novel saved
out_file = open(path, 'a', encoding='utf-8')

k = 0
# Download chapters one by one
for i in caption_title_1:
    nn = int(caption_title_2[k])
    k = k + 1
    print(nn)
    if nn > 1000:  # resume point: only fetch chapters numbered above 1000; adjust or drop
        time.sleep(1)
        # The relative links belong to xsbiquge.com; the original joined them
        # to xxbiquge.com, which points at the wrong site.
        chapter_url = 'https://www.xsbiquge.com' + i
        r1 = s.get(chapter_url)
        r1.encoding = 'utf-8'

        # Chapter title, taken from the page's keywords meta tag
        name = re.findall(r'<meta name="keywords" content="(.*?)" />', r1.text)[0]
        print(name)

        out_file.write(name)
        out_file.write('\n')

        # Chapter body
        chapters = re.findall(r'<div id="content">(.*?)</div>', r1.text, re.S)[0]
        chapters = chapters.replace(' ', '')
        chapters = chapters.replace('readx();', '')
        chapters = chapters.replace('&lt;!--go--&gt;', '')
        chapters = chapters.replace('()', '')
        # Strip the remaining tags by hand: turn <br/> into newlines, then
        # delete every other <...> span.
        s_replace = chapters.replace('<br/>', "\n")
        while True:
            index_begin = s_replace.find("<")
            if index_begin == -1:
                break
            index_end = s_replace.find(">", index_begin + 1)
            if index_end == -1:
                break
            s_replace = s_replace.replace(s_replace[index_begin:index_end + 1], "")
        pattern = re.compile(r'&nbsp;', re.I)
        fiction = pattern.sub(' ', s_replace)
        out_file.write(fiction)
        out_file.write('\n')

out_file.close()
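
The hand-rolled tag stripping above is easy to break. A sturdier alternative is to parse the page with lxml's etree.HTML, which builds an XPath-queryable tree and repairs sloppy markup automatically, then pull the text nodes out of the content div. A sketch against the same <div id="content"> layout, using the sample chapter URL from the comment earlier in this script:

import requests
from lxml import etree

resp = requests.get('https://www.xsbiquge.com/91_91879/640499.html',
                    headers={'User-Agent': 'Mozilla/5.0'})
resp.encoding = resp.apparent_encoding

# etree.HTML parses the page and handles tags and entities for us,
# so no regex or manual <...> removal is needed.
tree = etree.HTML(resp.text)
# text() yields each text node directly under the content div; <br/> tags
# merely separate the pieces, so joining with newlines restores paragraphs.
paragraphs = tree.xpath('//div[@id="content"]/text()')
chapter_text = "\n".join(p.strip().replace('\xa0', ' ')
                         for p in paragraphs if p.strip())
print(chapter_text[:200])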
