评书下载器

最新推荐文章于 2020-04-14 23:30:27 发布

robin_chenyu

最新推荐文章于 2020-04-14 23:30:27 发布

阅读量1.3k

点赞数

分类专栏： python 文章标签： python Python

本文链接：https://blog.csdn.net/robinchenyu/article/details/8571443

版权

python 专栏收录该内容

26 篇文章 0 订阅

订阅专栏

# -*- coding: cp936 -*-
# Bind ``urllib2`` to whichever module provides ``urlopen``: the Python 3
# ``urllib.request`` when it exists, otherwise the Python 2 ``urllib2``.
# The ``except ... as`` spelling parses on Python 2.6+ and on Python 3; the
# older comma spelling is a SyntaxError on Python 3.
try:
    import urllib.request as urllib2
except ImportError as details:
    import urllib2
import urllib
import re
import os
import sys

# Base URL of the site; the relative "down_*.htm" page names are joined onto it.
main_url = 'http://www.5ips.net'

def getPage(parurl):
    """Fetch one page, print its mp3 links and download a lone link.

    Every ``<a href="http://p18d.5ips.net/pingshu/...?...">`` link found in
    the page is printed, followed by the file name taken from the last
    segment of its path.  When the page holds exactly one such link, that
    link is handed to ``down_mp3`` together with the file name.

    parurl -- URL of the page to fetch.

    Returns the page body exactly as read from the response.
    """
    # Group 1 is the whole link including its query string, group 2 the path
    # below /pingshu/ up to the "?".  The "?" is escaped (it used to act as a
    # quantifier) and the path may not contain a quote, so a link without a
    # query string can no longer match past its closing quote.
    find_re = re.compile(
        r'<a href="(http://p18d.5ips.net/pingshu/([^?"]+)\?[^"]*)')

    response = urllib2.urlopen(parurl)
    try:
        page = response.read()
    finally:
        response.close()

    # NOTE(review): the matching and the decode/encode below operate on the
    # byte string that Python 2's read() returns; on Python 3 read() yields
    # bytes and this text pattern will not accept it -- confirm the target
    # interpreter before porting further.
    arr = find_re.findall(page)
    for link, path in arr:
        # Parenthesised single-argument print produces the same output as
        # the old ``print "out: ", value`` statement form.
        print("out:  %s" % link)
        print("out2:  %s" % path.split('/')[-1])

    if len(arr) == 1:
        link, path = arr[0]
        # Kept from the original: decode the link as gbk and re-encode it as
        # gb2312 before requesting it.
        down_mp3(link.decode('gbk').encode('gb2312'), path.split('/')[-1])

    return page

def getNextPage(page):
    """Follow the chain of ``down*.htm`` links starting from ``page``.

    Each ``A href="down...htm"`` link found in ``page`` is turned into an
    absolute URL under ``main_url`` and printed.  When exactly one link is
    found, that page is fetched with ``getPage`` and examined in turn; the
    walk stops at the first page with zero or several such links.

    page -- body of the page to start from, as returned by ``getPage``.

    Returns None.
    """
    find_re = re.compile(r'A href="(down[^"]+htm)"')

    # A loop instead of one recursive call per page, so the length of a
    # series is not capped by the interpreter's recursion limit.
    while True:
        arr = ['%s/%s' % (main_url, x) for x in find_re.findall(page)]
        for nxturl in arr:
            print("next:  %s" % nxturl)
            sys.stdout.flush()

        if len(arr) != 1:
            return

        page = getPage(arr[0])

def down_mp3(mp3_url, filename):
    """Download ``mp3_url`` and store the received bytes in ``filename``.

    Best effort: a failure is reported on stdout and swallowed.  The
    download happens before the output file is opened, so a failed request
    does not leave an empty file behind.

    mp3_url  -- URL to download.
    filename -- path of the file to write, opened in binary mode.

    Returns None.
    """
    try:
        response = urllib2.urlopen(mp3_url)
        try:
            data = response.read()
        finally:
            response.close()

        with open(filename, 'wb') as out:
            out.write(data)
        print('Downloaded')
    # Deliberately broad: this is the reporting boundary for one download.
    # The original ``except details:`` named a variable rather than an
    # exception class, so it never caught the failure it meant to report.
    except Exception as details:
        print("%s  not downloaded %s" % (mp3_url, details))

def savePage(filename, purl, tt):
    """Fetch ``purl`` and save the header ``tt`` followed by the page body.

    The whole response body is written unmodified; no part of the page is
    extracted.

    filename -- path of the file to (over)write, opened in binary mode.
    purl     -- URL whose response body is saved.
    tt       -- header written in front of the body; a text header is
                encoded as UTF-8, a byte-string header is written as is.

    Returns None.
    """
    response = urllib2.urlopen(purl)
    try:
        content = response.read()
    finally:
        response.close()

    # The file is opened in binary mode, so a text header has to be encoded
    # before it can be written.
    if not isinstance(tt, bytes):
        tt = tt.encode('utf-8')

    with open(filename, mode='wb') as f:
        f.write(tt)
        f.write(content)

def delBr(filename):
    """Copy ``filename`` to ``filename + '_t'`` with each <br> made a newline.

    Both files are handled as UTF-8 text; the input file is left unchanged.
    """
    with open(filename, mode='r', encoding='utf-8') as src:
        text = src.read()

    converted = text.replace('<br>', '\n')

    with open(filename+'_t', mode='w', encoding='utf-8') as dst:
        dst.write(converted)

# url='http://www.17k.com/list/90206.html'
# main_pager='http://www.17k.com%s'

# find_re = re.compile(r'<a title="([^;]+);[^h]+href="([^"]+)"|<h2>(.+)<', re.UNICODE)
# html=urllib2.urlopen(url).read()

# h2 = 1
# p1 = 1
# for x in find_re.findall(html.decode()):
#     if len(x[2]) > 0:
#         h2 = h2+1
#         p1 = 1
#     if len(x[0]) > 0:
#         filename='%d_%d.txt' % (h2,p1)
#         savePage(filename, main_pager % x[1], x[0])
#         # delBr(filename)
#         p1=p1+1
#         # print ("title = ", x[0])
#         # print ('href = ', main_pager % x[1])
#         # print ('header = ', h2)
#     # print ('content = ', x[3])

# url = 'http://p18d.5ips.net/pingshu/武侠小说_碧血剑/武侠小说_碧血剑_01.mp3?key=a5428ae0e2ffe365459a829b201090c1_413319097'

# uni = url.decode('gb2312')
# utf = uni.encode('utf-8')
# print(url)
# print(utf)
# savePage('2.txt',utf, '123')

# Start the crawl at down_119_01.htm: fetch that page, then keep following
# the single "down*.htm" link each page offers.
url = '%s/down_119_01.htm' % main_url

start_page = getPage(url)
getNextPage(start_page)
# getPage(url)

# delBr('2.txt')

robin_chenyu

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
评书下载器

# -*- coding: cp936 -*-
try:
    import urllib.request as urllib2
except ImportError, details:
    import urllib2
import urllib
import re
import os
import sys
main_url = 'http://www.5ips.net'
def ge…
复制链接

扫一扫