用正则表达式爬取小说盗墓笔记

# -*- coding: utf-8 -*-
import requests, os, re, codecs
def getHTMLText(url):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or ``""`` when the request fails (connection
        error, timeout, invalid URL, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body instead of the
        # one declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort download: report failure as an empty page, but let
        # KeyboardInterrupt and genuine programming errors propagate.
        return ""
def getURLList(html):
    """Extract every chapter link from the book's index page.

    Args:
        html: Markup of the index page, as a string.

    Returns:
        A list with the ``href`` value of each ``<li><a href="..." title=``
        anchor, in document order. The list is empty when nothing matches
        or when ``html`` is not a string.
    """
    try:
        # A single capture group makes findall return the hrefs directly.
        return re.findall(r'<li><a href="(.*?)" title=', html)
    except TypeError:
        # Non-string input: keep the result type consistent for callers.
        return []
def getContents(html):
    """Pull the chapter title and its paragraphs out of a chapter page.

    Args:
        html: Markup of a chapter page, as a string.

    Returns:
        A list whose first item is the chapter title (the text between
        ``盗墓笔记_`` and ``_全书网``) followed by every paragraph found
        between four ``&nbsp;`` entities and a ``<br />`` tag. The list is
        empty when the title marker is missing or ``html`` is not a string.
    """
    try:
        title = re.search(r'盗墓笔记_(.*?)_全书网', html)
        if title is None:
            # Without the title marker this is not a chapter page.
            return []
        paragraphs = re.findall(r'&nbsp;&nbsp;&nbsp;&nbsp;(.*?)<br />', html)
    except TypeError:
        # Non-string input: keep the result type consistent for callers.
        return []
    return [title.group(1), *paragraphs]
def main(url='http://www.quanshuwang.com/book/9/9055',
         out_path=r'C:\Users\xxx\Desktop\盗墓笔记.txt'):
    """Download every chapter of the book and append it to a text file.

    Fetches the index page, extracts the chapter links, then downloads each
    chapter in turn and appends its title and paragraphs to the output
    file, printing a percentage progress indicator as it goes.

    Args:
        url: Address of the book's index page.
        out_path: File the chapters are appended to, written as UTF-8.
    """
    urlList = getURLList(getHTMLText(url))
    total = len(urlList)
    for count, chapter_url in enumerate(urlList):
        print('\r当前进度:{:.2f}%'.format(count*100/total), end = '')
        contents_ = getContents(getHTMLText(chapter_url))
        # Reopen per chapter so finished chapters are on disk even if a
        # later download fails; `with` closes the file if a write raises.
        with codecs.open(out_path, 'a+', 'utf-8') as fo:
            for j in contents_:
                fo.write('\r\n' + j + '\r\n')
    if total:
        # The in-loop indicator is printed before each chapter, so it stops
        # short of 100%; report completion explicitly.
        print('\r当前进度:100.00%')
# Run the scraper only when executed as a script, so importing this module
# does not start a download.
if __name__ == "__main__":
    main()
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值