# -*- coding: utf-8 -*-
"""Download every chapter of a novel from quanshuwang.com into one text file.

Chapter links and chapter text are extracted from the raw HTML with regular
expressions; the pages are fetched with ``requests``.
"""
import codecs
import os  # unused by this script; retained from the original import list
import re
from typing import List

# Defaults used by main() when it is called without arguments.
BOOK_INDEX_URL = 'http://www.quanshuwang.com/book/9/9055'
OUTPUT_PATH = r'C:\Users\xxx\Desktop\盗墓笔记.txt'

# Compiled once at import time; in every pattern group 2 is the payload.
_CHAPTER_LINK_RE = re.compile(r'(<li><a href=")(.*?)(" title=)')
_CHAPTER_TITLE_RE = re.compile(r'(盗墓笔记_)(.*?)(_全书网)')
# NOTE(review): the first group is a literal space as found in this file; the
# live page may use the entity "&nbsp;" there instead -- confirm against the
# real page source before relying on this pattern.
_PARAGRAPH_RE = re.compile(r'( )(.*?)(<br />)')


def getHTMLText(url: str) -> str:
    """Fetch *url* and return the decoded page text.

    Returns an empty string when the request fails (connection error,
    timeout after 30 seconds, or a non-2xx status), so callers can treat a
    failed download as "no content".
    """
    # Imported here so the pure parsing helpers below can be imported and
    # tested without the third-party dependency being installed.
    import requests

    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Let requests guess the charset from the body instead of trusting
        # the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def getURLList(html: str) -> List[str]:
    """Return the chapter URLs found in the book index page *html*.

    Each URL is the ``href`` of an ``<li><a href="..." title=`` link, in page
    order. Empty or missing input yields an empty list.
    """
    if not html:
        return []
    return [match[1] for match in _CHAPTER_LINK_RE.findall(html)]


def getContents(html: str) -> List[str]:
    """Extract the chapter title and its paragraphs from a chapter page.

    Returns a list whose first item is the chapter title, followed by the
    paragraphs in page order. Returns an empty list when *html* is empty or
    the title pattern is not found, so such a page contributes nothing to
    the output.
    """
    if not html:
        return []
    title = _CHAPTER_TITLE_RE.search(html)
    if title is None:
        return []
    paragraphs = [match[1] for match in _PARAGRAPH_RE.findall(html)]
    return [title.group(2)] + paragraphs


def main(url: str = BOOK_INDEX_URL, output_path: str = OUTPUT_PATH) -> None:
    """Download all chapters linked from *url* and append them to *output_path*.

    Progress is printed on a single console line. If the index page cannot
    be fetched or contains no chapter links, nothing is written and the
    output file is not created.
    """
    chapter_urls = getURLList(getHTMLText(url))
    if not chapter_urls:
        return

    total = len(chapter_urls)
    # codecs.open works in binary mode underneath, so the explicit '\r\n'
    # sequences are written as-is with no newline translation.
    with codecs.open(output_path, 'a+', 'utf-8') as fo:
        for done, chapter_url in enumerate(chapter_urls):
            print('\r当前进度:{:.2f}%'.format(done * 100 / total), end='')
            for piece in getContents(getHTMLText(chapter_url)):
                fo.write('\r\n' + piece + '\r\n')
    # Report completion and finish the progress line.
    print('\r当前进度:{:.2f}%'.format(100.0))


# Run the download only when executed as a script, not on import.
if __name__ == '__main__':
    main()
# Article: scraping the novel 盗墓笔记 with regular expressions
# First published 2018-03-11 14:36:30