突然想看《仙剑问情》,网上又下载不到完整的txt版,看到“爱上中文”网有分章节在线版,于是有了写个爬虫下载小说的想法。具体代码如下:
import urllib.request
import re as re
# Accumulator for the whole novel: each chapter's text is appended
# to this module-level string by the loop at the bottom of the file.
txt=''
# Download a page.
def getSrc(url):
    """Fetch *url* and return its HTML source decoded from GBK.

    Uses a context manager so the underlying connection is closed;
    the original left the response object (and its socket) open.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode('GBK')
# Find the chapter URLs in the index page.
def findUrls(html):
    """Return the 8-digit chapter file names (e.g. '12345678.html')
    found on the first line of *html* containing '正文' (main text).

    Returns [] when no such line exists — the original returned None,
    which crashed the caller's ``for url in urlList`` loop.
    """
    for line in html.split('\r\n'):
        if '正文' in line:
            # Raw string, and the dot is escaped: the original pattern's
            # bare '.' matched any character (e.g. '12345678xhtml').
            return re.findall(r"(\d{8}\.html)", line)
    return []
# Extract the chapter body (the novel text) from a chapter page.
def getContent(html):
    """Parse one chapter page and return '\r\n<title>\r\n<body>\r\n'.

    Returns '' when the page has no 'contents' line; the original
    raised UnboundLocalError on such pages because ``txt`` (and
    possibly ``title``) were never assigned before ``return``.
    """
    title = ''
    txt = ''
    for line in html.split('\r\n'):
        if 'title' in line:
            title = line.replace('title>仙剑问情_正文 ', '').replace('</title>', '').replace('_爱上中文', '')
            print('now is process at ' + title)
        if 'contents' in line:
            # Strip the markup BEFORE removing spaces: the original
            # removed spaces first, which destroyed '<br />' and
            # '<div id="contents">' so those replaces never matched
            # and the tags leaked into the output.
            body = (line.replace("<br />", "")
                        .replace("<div id=\"contents\">", "")
                        .replace("</div>", "")
                        .replace(" ", ""))
            txt = '\r\n' + title + '\r\n' + body + '\r\n'
            break
    return txt
# Program entry point: fetch the index page, then every chapter,
# and save the assembled novel to disk.
mainUrl='http://www.aszw.com/book/50/50517/'
# 'or []' guards against findUrls returning nothing for this index page.
urlList=findUrls(getSrc(mainUrl)) or []
for url in urlList:
    novelUrl=mainUrl+url
    txt+=getContent(getSrc(novelUrl))
# Context manager + explicit encoding: the original never closed the
# file (risking a lost final buffer) and relied on the locale default
# encoding, which can fail to encode the Chinese text on some systems.
with open('F:\仙剑问情.txt','w',encoding='utf-8') as wfile:
    wfile.write(txt)