Background: one day I suddenly wanted to read a novel, but the novel site offered no download and was buried in ads, so...
Approach:
–Analyze the site's structure
–Scrape the table of contents (chapter titles and links)
–Download the chapter pages with multiple threads
–Extract the chapter text with regular expressions (a short sketch of this step follows the list)
–Save the chapters and write any failures to an error log
–Merge the per-chapter txt files into a single file
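Before the full script, here is a minimal sketch of the regex cleaning step. The text scraped from a chapter page starts with a redundant "(title)" header, and paragraphs are separated by pairs of ideographic spaces (\u3000\u3000); the script cuts everything up to the first ")" and everything after the last space pair. The sample string below is hypothetical:

import re

raw = "(第5章 开端)\u3000\u3000正文第一段。\u3000\u3000正文第二段。\u3000\u3000(本章完)"
# strip the "(title)" header: cut through the first ")"
head = re.search(r"[)]", raw).span()[0] + 1
# locate the last "\u3000\u3000" by searching the reversed string,
# then cut everything after it off the tail
tail = re.search("\u3000\u3000", raw[::-1]).span()[0] + 2
print(raw[head:-tail])  # -> \u3000\u3000正文第一段。\u3000\u3000正文第二段。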
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 2 18:23:49 2019
@author: 24709
"""
import urllib.request
import multiprocessing
from bs4 import BeautifulSoup
import re
import os
import time
# Novel home page: http://m.wenxuedu.com/html/208249
dirPath = "C:/Users/24709/Desktop/txtxt/"
# Folder holding all the chapter txt files (note the difference between \ and /);
# the merged book ends up in outfile.txt
titles = []  # all chapter titles
urls = []    # all chapter URL paths
################ Scrape chapter titles and URLs from the TOC pages ##################################
def geturl():
    print('Loading the chapter list .....')
    for index in range(5, 6):  # (5, 151) scrapes TOC pages 5 through 150
        for attempt in range(2):  # retry each page once before giving up
            try:
                request = urllib.request.Request("http://m.wenxuedu.com/html/208249_" + str(index))
                response = urllib.request.urlopen(request, timeout=8)
                data = response.read().decode('utf-8')
                soup = BeautifulSoup(data, "html.parser")
                i = 0
                for link in soup.findAll('li'):
                    # keep only <li><a> entries whose text contains "章";
                    # the first five such entries are navigation links, so skip them
                    if re.search(r'章', str(link.a.string)):
                        if i > 4:
                            print(str(link.a.string))
                            titles.append(str(link.a.string))
                            urls.append(str(link.a.get('href')))
                        i = i + 1
                break  # page parsed successfully, no retry needed
            except:
                if attempt == 1:
                    # both attempts failed: record the page index in the error log
                    writefile('error_log', "index:{} \n".format(index))
########################### Download chapter content from its URL ##################################
def getcontent(url):
    request = urllib.request.Request("http://m.wenxuedu.com" + url)
    response = urllib.request.urlopen(request)
    data = response.read().decode('utf-8')
    soup = BeautifulSoup(data, "html.parser")
    # the chapter body sits in <div id="novelcontent"><p>...</p>; str() keeps
    # the markup, so drop the <br/> tags and the enclosing <p> ... </p> ([3:-4])
    a = str(soup.find(id='novelcontent').p).replace('<br/>', '')[3:-4]
    try:
        # strip the redundant "(title)" header and the trailing junk
        head = re.search(r"[)]", a).span()[0] + 1
        a = a[head:-1 * (re.search('\u3000\u3000', a[::-1]).span()[0] + 2)]
    except:
        pass
    c = a  # fall back to page 1 alone if there is no page 2
    try:
        # long chapters are split in two: page 2 lives at <url>_2
        request = urllib.request.Request("http://m.wenxuedu.com" + url[:-1] + "_2")
        response = urllib.request.urlopen(request)
        data = response.read().decode('utf-8')
        soup = BeautifulSoup(data, "html.parser")
        b = str(soup.find(id='novelcontent').p).replace('<br/>', '')[3:-4]
        try:
            # page 2 repeats the "(title)" header; strip it as well
            head = re.search(r"[)]", b).span()[0] + 1
            c = a + b[head:-1]
        except:
            c = a + b
    except:
        pass  # this chapter has no second page
    return c
############################ Write to a file ###########################################
def writefile(title, content):
    # append mode, so the error log can accumulate entries across calls
    with open(dirPath + title + ".txt", 'a', encoding='utf-8') as f:
        f.write(content)
####################### Try to download a chapter; log failures ##############################
def download(title_url):
    try:
        writefile(title_url[0], getcontent(title_url[1]))
    except:
        # record the failed chapter's title and URL in the error log
        writefile('error_log', "title:{} url:{} \n".format(title_url[0], title_url[1]))
########################## Merge the chapter txt files ##############################################
def linkTheBook():
    print("------------------- Merging txt files -------------------")
    start0 = time.time()
    files = []
    for name in os.listdir(dirPath):
        # skip files without a chapter number in the name (e.g. the error log)
        if re.search(r'(\d+)', name):
            files.append(name)
    # pull the chapter number out with a regex and sort numerically
    files.sort(key=lambda i: int(re.search(r'(\d+)', i)[0]))
    res = ""
    i = 0
    for name in files:
        if name.endswith(".txt"):
            i += 1
            title = name[0:len(name) - 4]  # filename without the ".txt" suffix
            with open(dirPath + name, "r", encoding='utf-8') as f:
                content = f.read()
            res += "\n%s\n\n%s" % (title, content)
    with open(dirPath + "outfile.txt", "w", encoding='utf-8') as outFile:
        outFile.write(res)
    end0 = time.time()
    print("------------------- Merge finished -------------------")
    print("The whole book: " + str(len(files)) + " chapters, " + str(len(res)) + " characters")
    print('Elapsed time %0.2f s.' % (end0 - start0))
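# Why a numeric sort: a plain lexicographic sort would place "第10章" before
# "第2章" ('1' < '2' at the first differing character) and scramble the book.
# A hypothetical example:
#   sorted(["第10章.txt", "第2章.txt"])
#       -> ["第10章.txt", "第2章.txt"]   (wrong order)
#   sorted(["第10章.txt", "第2章.txt"], key=lambda s: int(re.search(r"\d+", s)[0]))
#       -> ["第2章.txt", "第10章.txt"]   (reading order)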
#######################################################################
if __name__ == "__main__":
    start = time.time()
    # scrape the chapter titles and URLs from the TOC into titles and urls
    geturl()
    print("------------------- Starting download -------------------")
    p = []
    print('Main process PID: %s' % os.getpid())
    for [title, url] in zip(titles, urls):
        # one process per chapter, downloading different chapters in parallel
        p.append(multiprocessing.Process(target=download, args=([title, url],)))
    print("Waiting for all the processes to finish ........")
    for i in p:
        i.start()
    for i in p:
        i.join()
    end = time.time()
    print("------------------- All downloads finished -------------------")
    print('Elapsed time %0.2f s.' % (end - start))
    ###################################
    #linkTheBook()
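The outline above says "multiple threads", but the script actually spawns one multiprocessing.Process per chapter, which is heavyweight for purely I/O-bound downloads. A bounded thread pool is a natural alternative; this is a sketch of mine (not the original code) that reuses the download() helper from the script:

from concurrent.futures import ThreadPoolExecutor

def download_all(titles, urls, max_workers=8):
    # at most max_workers chapters in flight at once, which is also
    # kinder to the site than one process per chapter
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # consume the iterator so every task runs to completion here
        list(pool.map(download, zip(titles, urls)))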
This post is a record of my own learning; comments and corrections are welcome.