还是上次的网站,这次继续爬取短篇小说。
哎嘿嘿嘿~
下面上代码:
import requests
import re
import os
class Update:
    """Crawler for the short-story section of kanunu8.com.

    ``main`` walks listing pages 1-5, collects every story's
    (relative-link, title) pair, downloads the story text with
    ``getText`` and saves it with ``writeTo``.  ``count1``/``count2``
    accumulate the number of successful/failed downloads.
    """

    def __init__(self):
        self.count1 = 0  # stories downloaded and written successfully
        self.count2 = 0  # stories skipped because of an error
        # Browser-like User-Agent so the site serves the page normally.
        self.head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                   'Chrome/41.0.2272.118 Safari/537.36'}

    def main(self):
        """Crawl listing pages 1-5 and download every story found on them."""
        # Compile once, outside the page loop.
        link_pattern = re.compile(
            r"<td width='98%'> <a href='(.*?)' target='_blank'>(.*?)</a></td>", re.S)
        for page in range(1, 6):
            url = 'https://www.kanunu8.com/files/little/6-' + str(page) + '.html'
            # Site is GBK-encoded; decode explicitly.
            html = requests.get(url, headers=self.head).content.decode('gbk')
            links = link_pattern.findall(html)  # all (href, title) pairs on this listing page
            # enumerate instead of links.index(ur): O(1) and correct even when
            # the same (href, title) pair appears twice in the listing.
            for idx, ur in enumerate(links, 1):
                try:
                    text = self.getText(ur)
                    print('\t第%d页下载完成。。' % idx)
                    self.writeTo(ur[1], text)
                    print('\t 第%d页写入完成。。' % idx)
                    self.count1 += 1
                except (requests.RequestException, UnicodeDecodeError, IndexError, OSError):
                    # Best-effort crawl: log, count the failure, move on.
                    print('第%d出错。。。跳过本页。。' % idx)
                    self.count2 += 1
            print('第%d页下载完成,开始下载第%d页' % (page, page + 1))
        print('全部短篇下载完成!-*-\n\t成功%d篇\n\t失败%d篇' % (self.count1, self.count2))

    def getText(self, url0):
        """Fetch one story page and return its plain-text content.

        url0 -- (href, title) tuple captured from a listing page; only
        the relative href (url0[0]) is used here.
        Raises IndexError if the content <td> is not found on the page.
        """
        url = 'https://www.kanunu8.com' + url0[0]
        html = requests.get(url, headers=self.head).content.decode('gbk')
        pattern = re.compile(r'<td height="20" valign="top">(.*?)</td>', re.S)
        body = pattern.findall(html)[0]
        body = re.sub(r'<.*?>', '', body)     # strip remaining HTML tags
        body = re.sub(r'&\w*?;', '', body)    # strip HTML entities such as &nbsp;
        body = re.sub(r'\s\s\s+', ',', body)  # collapse large whitespace runs
        return body

    @staticmethod
    def createPath(pathname):
        """Ensure *pathname* exists and make it the current working directory."""
        # exist_ok avoids the exists()/makedirs() race of the LBYL version.
        os.makedirs(pathname, exist_ok=True)
        os.chdir(pathname)

    @staticmethod
    def writeTo(title, content):
        """Save *content* to '<title>-爬取.txt' (UTF-8) in the current directory."""
        # Explicit UTF-8: the platform default codec (e.g. GBK on Windows)
        # may fail or mangle the Chinese text.
        with open(title + '-爬取.txt', 'w', encoding='utf-8') as f:
            f.write(content)
if __name__ == '__main__':
    # Script entry point: prepare the output directory, then start crawling.
    crawler = Update()
    crawler.createPath('短篇小说集')
    crawler.main()
哎嘿嘿嘿!
觉得还可以的话,点个赞再走吧!