http://www.fushubb.com/bookall/chuanyue/86698.html
import re
import requests
from bs4 import BeautifulSoup #网页解析 获取数据
import random
import time
import socket
import http.client
def main():
    """Run the whole job: scrape the pages, save the text, then re-break it.

    The scraped data from ``getData`` is handed straight to ``write_data``;
    afterwards ``change`` post-processes the saved file.
    """
    write_data(getData())
    change()
def askURL(url):
    """Fetch *url* and return the ``requests`` response, retrying on failure.

    The request is repeated (with a randomized pause between attempts) until
    one attempt completes without raising, so this function does not return
    until a response has been obtained. The response's ``encoding`` is forced
    to ``'gbk'`` before it is returned, so ``response.text`` decodes as GBK.

    Args:
        url: Address of the page to download.

    Returns:
        The ``requests`` response object for the successful attempt.
    """
    # NOTE: the header values were elided in the original listing. Replace
    # the "..." placeholders with real values before running.
    header = {
        "User-Agent": "...",
        "Accept": "...",
        "Cookie": "...",
    }
    # One timeout (in seconds, 200-299) is drawn per call and reused for
    # every retry of that call.
    timeout = random.choice(range(200, 300))
    while True:
        try:
            # Assigning requests.adapters.DEFAULT_RETRIES after import has no
            # effect on new adapters, so the retry count is configured on an
            # adapter that is mounted explicitly. The ``with`` block closes
            # the session (and its pooled connections) after every attempt.
            with requests.Session() as session:
                adapter = requests.adapters.HTTPAdapter(max_retries=5)
                session.mount('http://', adapter)
                session.mount('https://', adapter)
                html = session.get(url, headers=header, timeout=timeout)
            html.encoding = 'gbk'
            break
        except socket.timeout as e:
            print('3:', e)
            time.sleep(random.choice(range(8, 15)))
        except socket.error as e:
            # socket.error is OSError; requests documents its own exceptions
            # as IOError/OSError subclasses, so they are expected to land here.
            print('4:', e)
            time.sleep(random.choice(range(20, 30)))
        except http.client.BadStatusLine as e:
            print('5:', e)
            time.sleep(random.choice(range(30, 80)))
        except http.client.IncompleteRead as e:
            print('6:', e)
            time.sleep(random.choice(range(5, 15)))
    return html
def getData():
    """Download pages 2 through 122 and collect the text of their ``<div>`` tags.

    Page 1 is not fetched here (the original author notes that its URL
    differs and is handled separately).

    Returns:
        A list with one entry per page; each entry is a list of strings, one
        for every ``<div>`` whose markup matched ``<div>(.*?)</div>``.
    """
    div_pattern = re.compile(r'<div>(.*?)</div>')
    pages = []
    for page_no in range(2, 123):
        page_url = 'http://www.fushubb.com/bookall/chuanyue/86697_{}.html'.format(page_no)
        response = askURL(page_url)
        soup = BeautifulSoup(response.text, "html.parser")
        fragments = []
        for div in soup.find_all('div'):
            matches = div_pattern.findall(str(div))
            if not matches:
                continue
            # Keep only the first match; drop full-width spaces (U+3000) and
            # then every remaining whitespace character.
            text = matches[0].replace("\u3000", "")
            fragments.append("".join(text.split()))
        pages.append(fragments)
    return pages
def write_data(data):
    """Append the scraped text to ``存放结果.txt`` as one comma-separated run.

    Every string found in *data* (lists/tuples are walked recursively) is
    joined with ``", "`` and appended to the file, which is created if it does
    not exist. The text itself is written unchanged: brackets, quotes and
    backslashes that are part of the content are preserved rather than being
    stripped or escaped.

    Args:
        data: A string, or an arbitrarily nested list/tuple of values; values
            that are not strings are converted with ``str``.
    """
    txt = ", ".join(_iter_text(data))
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('存放结果.txt', 'a', encoding='utf-8') as file_handle:
        file_handle.write(txt)


def _iter_text(data):
    """Yield each leaf of a (possibly nested) list/tuple as a string."""
    if isinstance(data, (list, tuple)):
        for item in data:
            yield from _iter_text(item)
    else:
        yield str(data)
def change():
    """Put every chapter heading (``第N章``) on a line of its own.

    Reads ``存放结果.txt`` (UTF-8), surrounds each match of ``第[1-9]\\d*章``
    with a ``\\r\\n`` before and after it, and writes the result to ``上.txt``.

    All insertions are done in a single ``re.sub`` pass, so each heading is
    located in the original text; inserting while iterating over previously
    computed match offsets would shift later headings and corrupt the output.
    The output is written as UTF-8 with ``newline=''`` so the inserted
    ``\\r\\n`` sequences reach the file exactly as written on every platform.
    """
    keyword = re.compile(r'第[1-9]\d*章')
    line_break = '\r\n'
    with open('存放结果.txt', 'r', encoding='utf-8') as file:
        content = file.read()
    content = keyword.sub(
        lambda match: line_break + match.group() + line_break, content
    )
    with open(r'上.txt', 'w', encoding='utf-8', newline='') as file:
        file.write(content)
# Start the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
参考:
1、askURL中增加重连次数以及关闭连接
python 关于Max retries exceeded with url 的错误
2、change中在‘第几章’的位置换行
在文件指定位置插入字符串
在a文件的keyword之后插入字符串str
file = open('a','r')
content = file.read()
post = content.find(keyword)
if post != -1:
    content = content[:post+len(keyword)]+str+content[post+len(keyword):]
file = open('a','w')
file.write(content)
file.close()
这其中的content[:post]读取的是keyword之前的内容,content[post:]读取的是包括keyword在内的之后的内容。所以,要在keyword之后插入str,需要使用content[:post+len(keyword)]与content[post+len(keyword):]。