# -*- coding: cp936 -*-
# urllib2 compatibility shim: Python 3 moved urllib2's API to urllib.request.
# The "as" form of the except clause parses on both Python 2.6+ and Python 3
# (the old "except ImportError, details" form is a SyntaxError on Python 3).
try:
    import urllib.request as urllib2
except ImportError as details:
    import urllib2
import urllib
import re
import os
import sys

# Site root; relative chapter links are joined onto this.
main_url = 'http://www.5ips.net'
# Download anchors on a chapter page.  Group 1 is the complete href value,
# group 2 is the path below /pingshu/ up to (not including) the query string.
# Both groups stop at the closing quote so a match can never run past the
# attribute value.  The pattern is bytes because the page is matched as
# downloaded, before any decoding.
_MP3_LINK_RE = re.compile(
    br'<a href="(http://p18d\.5ips\.net/pingshu/([^?"]+)[^"]*)')


def _to_request_url(raw_url):
    """Percent-encode the non-ASCII bytes of *raw_url* and return ASCII text."""
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2
    # '%' stays safe so escapes already present in the link are not doubled.
    return quote(raw_url, safe="%/:=&?~#+!$,;'@()*[]")


def getPage(parurl):
    """Fetch a chapter page, report its mp3 links and download a lone link.

    Every matching link is printed together with the last component of its
    path.  When the page contains exactly one link, it is passed to
    ``down_mp3`` and saved under that last path component.

    :param parurl: URL of the chapter page to fetch.
    :return: the page body exactly as downloaded (undecoded).
    """
    response = urllib2.urlopen(parurl)
    try:
        page = response.read()
    finally:
        response.close()

    links = _MP3_LINK_RE.findall(page)
    for raw_url, raw_path in links:
        # The links are decoded as GBK only for display and for file names.
        print("out:  %s" % raw_url.decode('gbk', 'replace'))
        print("out2:  %s" % raw_path.split(b'/')[-1].decode('gbk', 'replace'))

    if len(links) == 1:
        raw_url, raw_path = links[0]
        filename = raw_path.split(b'/')[-1].decode('gbk', 'replace')
        # Request the link with its original bytes percent-encoded, which
        # keeps the URL ASCII-only for urlopen.
        down_mp3(_to_request_url(raw_url), filename)
    return page
# Anchor pointing at a follow-up "down...htm" page.
_NEXT_LINK_RE = re.compile(r'A href="(down[^"]+htm)"')


def getNextPage(page):
    """Follow "next page" links starting from *page* until the chain ends.

    Each candidate link found in the current page is printed.  When exactly
    one candidate exists it is fetched with ``getPage`` and the search
    continues in the page that call returns; otherwise the walk stops.

    The walk is a loop rather than one recursive call per page, so a long
    book cannot exhaust the interpreter's recursion limit, and a link that
    leads back to an already fetched page ends the walk instead of looping
    forever.

    :param page: page content, either text or undecoded bytes.
    :return: None.
    """
    visited = set()
    while True:
        if isinstance(page, bytes):
            # A text pattern cannot be applied to bytes on Python 3.
            page = page.decode('gbk', 'replace')

        candidates = ['%s/%s' % (main_url, href)
                      for href in _NEXT_LINK_RE.findall(page)]
        for nxturl in candidates:
            print("next:  %s" % nxturl)
        sys.stdout.flush()

        if len(candidates) != 1 or candidates[0] in visited:
            return
        visited.add(candidates[0])
        page = getPage(candidates[0])
def down_mp3(mp3_url, filename):
    """Download *mp3_url* and store the body in *filename*.

    This is best-effort: any failure is reported on stdout and swallowed so
    the caller can carry on with the next chapter.

    :param mp3_url: URL of the file to download.
    :param filename: path of the local file to write.
    :return: None.
    """
    try:
        response = urllib2.urlopen(mp3_url)
        try:
            data = response.read()
        finally:
            response.close()
        # The file is opened only after the download succeeded, so a failed
        # request does not leave an empty file behind.
        with open(filename, 'wb') as out:
            out.write(data)
    except Exception as details:
        # Deliberately broad: report the failure and keep going.
        print("%s  not downloaded %s" % (mp3_url, details))
    else:
        print('Downloaded')
def savePage(filename, purl, tt):
    """Download *purl* and save it to *filename*, preceded by header *tt*.

    :param filename: path of the file to write (opened in binary mode).
    :param purl: URL whose body is downloaded.
    :param tt: header written before the body; text is encoded as UTF-8,
        bytes are written unchanged.
    :return: None.
    """
    response = urllib2.urlopen(purl)
    try:
        content = response.read()
    finally:
        response.close()

    if not isinstance(tt, bytes):
        # The file is binary, so a text header must be encoded first.
        tt = tt.encode('utf-8')

    with open(filename, mode='wb') as f:
        f.write(tt)
        f.write(content)
def delBr(filename):
    """Write a copy of *filename* in which HTML line breaks become newlines.

    The input is read as UTF-8 and the result is written, also as UTF-8, to
    ``filename + '_t'``; the input file itself is left untouched.  All common
    spellings of the tag are replaced: ``<br>``, ``<BR>``, ``<br/>`` and
    ``<br />``.

    :param filename: path of the UTF-8 text file to convert.
    :return: None.
    """
    # io.open accepts encoding= on Python 2 as well as Python 3.
    import io

    with io.open(filename, mode='r', encoding='utf-8') as src:
        text = src.read()

    converted = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)

    with io.open(filename + '_t', mode='w', encoding='utf-8') as dst:
        dst.write(converted)
# url='http://www.17k.com/list/90206.html'
# main_pager='http://www.17k.com%s'
# find_re = re.compile(r'<a title="([^;]+);[^h]+href="([^"]+)"|<h2>(.+)<', re.UNICODE)
# html=urllib2.urlopen(url).read()
# h2 = 1
# p1 = 1
# for x in find_re.findall(html.decode()):
# if len(x[2]) > 0:
# h2 = h2+1
# p1 = 1
# if len(x[0]) > 0:
# filename='%d_%d.txt' % (h2,p1)
# savePage(filename, main_pager % x[1], x[0])
# # delBr(filename)
# p1=p1+1
# # print ("title = ", x[0])
# # print ('href = ', main_pager % x[1])
# # print ('header = ', h2)
# # print ('content = ', x[3])
# url = 'http://p18d.5ips.net/pingshu/武侠小说_碧血剑/武侠小说_碧血剑_01.mp3?key=a5428ae0e2ffe365459a829b201090c1_413319097'
# uni = url.decode('gb2312')
# utf = uni.encode('utf-8')
# print(url)
# print(utf)
# savePage('2.txt',utf, '123')
# Script entry: fetch the starting page, then keep following the
# "next page" links found from there.
url = '%s/down_119_01.htm' % (main_url,)
getNextPage(page=getPage(parurl=url))