找到小说的目录地址
# Index (table-of-contents) page of the novel to download.
url = 'https://www.8dkan.com/daoshibuhaoruo/'
根据小说目录页源码格式,以及每一章正文的源码,写出相应的正则表达式
import re  # BUG FIX: `re` was used here but only imported further down the script

# Matches one chapter entry on the index page:
# group(1) = relative chapter URL, group(2) = chapter title.
list_partter = re.compile(r'<li><a href="(.*?)">(.*?)</a></li>')
# Matches the chapter-body container; group(1) = raw HTML of the chapter text.
data_partter = re.compile(r'<div class="box_box">(.*?)</div>')
使用fiddler得到浏览器请求头来模拟浏览器请求,再使用urllib来获取小说内容
import urllib.request
import urllib.parse
import urllib.error
import re
import time
def getData(url, n):
    """Fetch *url* with a browser-like User-Agent and return the page as text.

    The response body is decoded as GBK (the target site's encoding).
    *n* is the current attempt number: on any failure the request is retried
    after a short delay until n exceeds 3, then None is returned.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0'}
        request = urllib.request.Request(url=url, headers=headers)
        # `with` ensures the connection is closed even if read/decode fails.
        with urllib.request.urlopen(request, timeout=5) as response:
            return response.read().decode('gbk')
    except Exception:
        if n <= 3:
            # Rapid repeated requests can be rejected; back off before retrying.
            time.sleep(2)
            # BUG FIX: the original called getData(url, n) without returning the
            # result, so every retried fetch returned None to the caller.
            return getData(url, n + 1)
        return None  # give up after 3 retries
# Download every chapter in `lists` (pairs of (relative_url, title) extracted
# with list_partter elsewhere in this script) and append them to one text file.
# `with` guarantees the file is closed; utf-8 avoids locale-dependent encoding.
with open(r'G:/download/2.txt', 'a+', encoding='utf-8') as txt:
    for chap in lists:
        try:
            data = getData(url + chap[0], 1)
            info = re.search(data_partter, data, re.I | re.S)
            # BUG FIX: group(1) is the captured chapter body; the original used
            # group(0), which wrote the enclosing <div ...>...</div> tags into
            # the plain-text output. Convert <br /> runs to newlines.
            content = info.group(1).replace('<br /><br />', '\n').replace('<br />', '\n')
            txt.write(chap[1])
            txt.write(content + '\n\n')
            print('%s' % chap[1])
        except Exception:
            # Best-effort scrape: skip chapters that fail to download or parse.
            continue