Don't repeat yourself — 温故而知新

import requests, re
from urllib.parse import urljoin
# Write to file
def fileWrite(title, message, name):
    """Append one titled section to the text file ``<name>.txt``.

    The file is created if it does not exist; existing content is kept and
    the new section is added at the end. A success line is printed afterwards.

    Args:
        title: Heading written on its own line before the body.
        message: Body text written after the title, followed by a newline.
        name: Output path without the ``.txt`` suffix.
    """
    # Explicit UTF-8: the platform default encoding (e.g. GBK or cp1252 on
    # Windows) may be unable to encode arbitrary scraped text.
    with open(name + ".txt", "a", encoding="utf-8") as f:
        f.write(title + "\n" + message + "\n")
    print(title, "下载成功!")
# Fetch page content
def getWebPage(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the decoded response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The response text, decoded with the encoding detected from the
        content, or ``None`` if the request failed (an error message is
        printed in that case).
    """
    headers = {"User-Agent": "Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) "
                             "AppleWebKit/604.1.34 (KHTML, like Gecko) Versio"
                             "n/11.0 Mobile/15A5341f Safari/604.1", }
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of requests.get is ``params``, which would append the
        # User-Agent to the query string instead of sending it as a header.
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        print("页面请求出错!")
        return None
    # Use the encoding guessed from the body rather than the declared one.
    res.encoding = res.apparent_encoding
    return res.text
# Extract links
def getLink(page, url, paternName, paternLink):
    """Extract a name and a sequence of absolute links from a page.

    Args:
        page: HTML text to search.
        url: Base address used to resolve relative links.
        paternName: Regex whose first match in *page* is returned as the name.
        paternLink: Regex searched in *page* with the multiline, dot-all and
            ignore-case flags; every match is treated as a link.

    Returns:
        A ``(name, links)`` tuple, where *links* is an iterator over the
        matched links resolved against *url*, or ``None`` if extraction
        failed (an error message is printed in that case).
    """
    try:
        links = re.findall(paternLink, page, re.M | re.S | re.I)
        # Convert relative paths to absolute ones
        links = iter([urljoin(url, link) for link in links])
        name = re.findall(paternName, page)[0]
    except (TypeError, ValueError, IndexError, re.error):
        # TypeError: page is not a string (e.g. None after a failed fetch) or
        # a match is not a plain string; IndexError: the name pattern did not
        # match; re.error: invalid pattern; ValueError: malformed URL.
        print("提取页面信息错误!")
        return None
    return name, links
# Fetch content
def getContent(name, link, paternTitle, paternBody):
    """Download one page and append its title and body to the output file.

    The page at *link* is fetched with ``getWebPage``; the first match of
    each pattern is taken as title and body. In the body, ``&nbsp;`` becomes
    a space and ``<br />`` becomes a newline before the text is handed to
    ``fileWrite``.

    Args:
        name: Passed to ``fileWrite`` as the output file name.
        link: Address of the page to download.
        paternTitle: Regex whose first match in the page is the title.
        paternBody: Regex whose first match in the page is the body.
    """
    page = getWebPage(link)
    if page is None:
        # The fetch failed and getWebPage has already reported it; there is
        # nothing to extract.
        return
    try:
        title = re.findall(paternTitle, page)[0]
        content = re.findall(paternBody, page)[0]
        content = content.replace("&nbsp;", " ").replace("<br />", "\n")
        fileWrite(title, content, name)
    except Exception:
        # Best-effort per page: report and let the caller carry on, but do
        # not swallow KeyboardInterrupt/SystemExit as a bare ``except`` would.
        print("提取小说页面错误!")

# main
def main(url, paternName, paternLink, paternTitle, paternBody):
    """Download every page linked from *url* and report completion.

    Args:
        url: Address of the page that lists the links to follow.
        paternName: Regex passed to ``getLink`` to extract the name.
        paternLink: Regex passed to ``getLink`` to extract the links.
        paternTitle: Regex passed to ``getContent`` for each page's title.
        paternBody: Regex passed to ``getContent`` for each page's body.
    """
    page = getWebPage(url)
    result = getLink(page, url, paternName, paternLink)
    if result is None:
        # getLink prints its own error and returns None on failure;
        # unpacking that None would raise TypeError.
        return
    name, links = result
    for link in links:
        getContent(name, link, paternTitle, paternBody)

    print("小说下载完毕!")
if __name__ == '__main__':
    # Start address and the four regexes, passed to main() in this order:
    # name, link, title, body.
    start_url = "http://www.xbiquge.la/2/2208/"
    name_pattern = "<h1>(.*?)</h1>"
    link_pattern = "<dd><a href='(.*?)' >.*?</a></dd>"
    title_pattern = "<h1>(.*?)</h1>"
    body_pattern = '<div id="content">(.*?)<p>.*?</p></div>'
    main(start_url, name_pattern, link_pattern, title_pattern, body_pattern)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值