爬取fushuwang小说txt

http://www.fushubb.com/bookall/chuanyue/86698.html

import re
import requests
from bs4 import BeautifulSoup       #网页解析 获取数据
import random
import time
import socket
import http.client

def main():
    """Run the whole pipeline: scrape the pages, save the text, then reformat it.

    The scraped fragments are handed straight to ``write_data``; afterwards
    ``change`` post-processes the saved file.
    """
    write_data(getData())
    change()


def askURL(url):
    """Download *url* and return the response, retrying on transient errors.

    Timeouts, connection failures and truncated (chunked) responses are
    retried indefinitely, with a randomized pause between attempts. Other
    ``requests`` errors (for example a malformed URL) are not retried and
    propagate to the caller, because retrying them can never succeed.
    The HTTP status code is not checked.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``requests.Response``, with ``encoding`` set to ``'gbk'`` so that
        ``.text`` is decoded as GBK.
    """
    # The header values were elided in the published source, which made the
    # dict literal a syntax error. Fill them in with values copied from a
    # browser session; entries left empty are simply not sent.
    header = {
        "User-Agent": "",
        "Accept": "",
        "Cookie": "",
    }
    header = {name: value for name, value in header.items() if value}
    timeout = random.choice(range(200, 300))

    while True:
        try:
            # One session per attempt; the context manager closes it so no
            # pooled connections are left open between retries.
            with requests.Session() as s:
                # Assigning requests.adapters.DEFAULT_RETRIES at runtime does
                # not change adapters created afterwards, so the retry count
                # is configured on an explicitly mounted adapter instead.
                adapter = requests.adapters.HTTPAdapter(max_retries=5)
                s.mount('http://', adapter)
                s.mount('https://', adapter)
                html = s.get(url, headers=header, timeout=timeout)
        except requests.exceptions.Timeout as e:
            print('3:', e)
            time.sleep(random.choice(range(8, 15)))
        except requests.exceptions.ChunkedEncodingError as e:
            print('6:', e)
            time.sleep(random.choice(range(5, 15)))
        except requests.exceptions.ConnectionError as e:
            print('4:', e)
            time.sleep(random.choice(range(20, 30)))
        else:
            html.encoding = 'gbk'
            return html


def getData(url_template='http://www.fushubb.com/bookall/chuanyue/86697_{}.html',
            first_page=2, last_page=122):
    """Fetch a range of numbered pages and collect the text of their ``<div>`` blocks.

    By default pages 2 through 122 are fetched. Page 1 is not fetched here:
    the original author notes that its URL follows a different scheme.

    Args:
        url_template: URL containing one ``{}`` placeholder for the page number.
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive).

    Returns:
        A list with one entry per page; each entry is a list of strings, one
        per matching ``<div>``, with all whitespace removed.
    """
    # Compiled once instead of once per <div>. Only attribute-less divs whose
    # content sits on a single line can match (no DOTALL flag).
    div_pattern = re.compile(r'<div>(.*?)</div>')

    txt = []
    for i in range(first_page, last_page + 1):
        html = askURL(url_template.format(i))
        bs = BeautifulSoup(html.text, "html.parser")
        data = []
        for item in bs.find_all('div'):
            link = div_pattern.findall(str(item))
            if not link:
                continue
            # split() with no argument splits on every Unicode whitespace
            # character, including the ideographic space U+3000, so joining
            # the pieces strips all of them in one step.
            data.append("".join(link[0].split()))
        txt.append(data)
    return txt


def write_data(data, path='存放结果.txt', sep=', '):
    """Append the scraped text fragments to a UTF-8 text file.

    The fragments are joined directly instead of going through ``str(data)``
    and stripping ``[``, ``]`` and ``'``: that approach also deleted those
    characters from the text itself and leaked ``repr`` escapes and quotes
    into the output.

    Args:
        data: Either a list of strings or a list of lists of strings
            (one inner list per page).
        path: File to append to; created if it does not exist.
        sep: Text placed between consecutive fragments. The default matches
            the separator the previous ``str(list)`` output produced.
    """
    fragments = []
    for page in data:
        if isinstance(page, str):
            fragments.append(page)
        else:
            fragments.extend(page)

    # The with-statement closes the file; no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as file_handle:
        file_handle.write(sep.join(str(fragment) for fragment in fragments))

def change(src='存放结果.txt', dst='上.txt'):
    """Put every chapter heading of the form "第N章" on its own line.

    Reads *src*, surrounds each heading with a line break on both sides, and
    writes the result to *dst* (overwriting it). *dst* is written even when
    no heading is found.

    All replacements are done in a single ``re.sub`` pass. Splicing the
    string inside a ``finditer`` loop is wrong here, because the match
    offsets refer to the unmodified text and drift after the first insertion.

    Args:
        src: UTF-8 text file to read.
        dst: File to write, encoded as UTF-8.
    """
    keyword = re.compile(r'第[1-9]\d*章')
    line_break = '\r\n'

    with open(src, 'r', encoding='utf-8') as source_file:
        content = source_file.read()

    content = keyword.sub(
        lambda match: line_break + match.group() + line_break, content
    )

    # newline='' writes '\r\n' exactly as given instead of letting text mode
    # translate the '\n' a second time on platforms whose line ending is '\r\n'.
    with open(dst, 'w', encoding='utf-8', newline='') as target_file:
        target_file.write(content)


# Start the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

参考:
1、askURL中增加重连次数以及关闭连接
python 关于Max retries exceeded with url 的错误

2、change中在‘第几章’的位置换行
在文件指定位置插入字符串
在a文件的keyword之后插入字符串str

 file = open('a','r')
 content = file.read()
 post = content.find(keyword)
 if post != -1:
     content = content[:post+len(keyword)]+str+content[post+len(keyword):]
     file = open('a','w')
     file.write(content)
 file.close()

其中content[:post]读取的是keyword之前的内容,content[post:]读取的是从keyword开始(包括keyword在内)之后的内容。所以要在keyword之后插入str,需要使用content[:post+len(keyword)]与content[post+len(keyword):]。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值