import urllib.request
import re
import os
import time
# URL of the novel's chapter directory (table of contents) page.
mulu = 'https://www.9dxs.com/2/2308/index.html'

# Fetch the directory page. The context manager closes the HTTP response
# as soon as the body has been read instead of leaving it open for the
# remainder of the run.
with urllib.request.urlopen(mulu) as response:
    # The body is decoded as GBK before any pattern matching is done on it.
    html = response.read().decode('gbk')
def get_zhang(lianjie, biaoti, base_url='https://www.9dxs.com/2/2308/'):
    """Download one chapter page and return its body as plain text.

    Args:
        lianjie: Chapter href, appended verbatim to ``base_url``.
        biaoti: Chapter title. Not used by this function; kept so that
            existing two-argument calls keep working.
        base_url: URL prefix the chapter href is appended to. Defaults to
            the novel's directory on the site.

    Returns:
        The contents of the page's ``<div id="content" class="content">``
        block with all tags removed and ``&nbsp;`` entities turned into
        ordinary spaces, or an empty string when the page contains no
        such block.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the page body is not valid GBK.
    """
    zhang = base_url + lianjie
    # Close the response as soon as the body has been read.
    with urllib.request.urlopen(zhang) as response:
        html = response.read().decode('gbk')

    # Group 2 is the chapter body. The trailing greedy ``.*`` (with re.S)
    # runs up to the last chapter-page marker, so the pattern can match at
    # most once per page and a single search is sufficient.
    pattern = re.compile(
        r'(<div id="content" class="content">)(.*?)(</div>.*<div class="chapterpage">)',
        re.S,
    )
    match = pattern.search(html)
    if match is None:
        # No content block: return an empty body rather than failing on
        # an unassigned result.
        return ''

    # Strip every HTML tag from the body.
    text = re.sub('<.*?>', '', match.group(2))
    # Entities survive tag stripping; presumably the indentation on these
    # pages is built from &nbsp;, so normalise it to plain spaces.
    text = text.replace('&nbsp;', ' ')
    return text
def baocun(biaoti, zhengwen, filename='帝临鸿蒙.txt'):
    """Append one chapter to the output text file.

    The chapter is written as a CRLF, the title, another CRLF and then
    the body, in that order.

    Args:
        biaoti: Chapter title.
        zhengwen: Chapter body text.
        filename: Path of the file to append to. It is created when it
            does not exist yet.
    """
    # encoding='utf-8' keeps the Chinese text writable regardless of the
    # platform's locale encoding; newline='' writes the explicit '\r\n'
    # separators verbatim instead of letting text mode translate them.
    # The context manager closes the file even if the write fails.
    with open(filename, 'a', encoding='utf-8', newline='') as fo:
        fo.write('\r\n' + biaoti + '\r\n' + zhengwen)
# Every directory entry is an <li> wrapping a single link: the first group
# captures the chapter's href, the second its title.
pattern = re.compile(u'<li><a href="(.*?)">(.*?)</a></li>')
mulu = pattern.findall(html)

# Walk the directory in page order: fetch each chapter, append it to the
# output file, then wait five seconds before the next request.
for lianjie, biaoti in mulu:
    zhengwen = get_zhang(lianjie, biaoti)
    baocun(biaoti, zhengwen)
    time.sleep(5)