import cookielib
import os
import sys
import urllib2
from sgmllib import SGMLParser
class Myvoa():
    '''
    Download texts and mp3 files from http://www.51voa.com.

    Creating an instance installs a process-wide urllib2 opener with cookie
    support.  ``geturls`` collects the article links from the "VOA Standard
    English" index page and ``dlurls`` downloads the mp3 of every article.
    '''

    def __init__(self):
        self.basicurl = "http://www.51voa.com"
        self.basicdir = os.curdir
        self.urls = []
        # Keep cookies across requests; the opener is installed globally so
        # every later urllib2.urlopen() call goes through it.
        cookie = cookielib.CookieJar()
        cookieProc = urllib2.HTTPCookieProcessor(cookie)
        opener = urllib2.build_opener(cookieProc)
        urllib2.install_opener(opener)

    def savefile(self, name, url=-1, data=-1):
        '''
        Save a file called ``name`` in the current directory.

        name -- file name; path separators are removed from it.
        url  -- address to download the content from (-1 means "not given").
        data -- content to write when no ``url`` is given (-1 means
                "not given").

        Nothing is written when the target file already exists or when
        neither ``url`` nor ``data`` is supplied.  Download errors raised by
        urllib2 propagate to the caller.
        '''
        name = name.replace(os.sep, '')
        target = os.path.join(os.curdir, name)
        print(url)
        if os.path.isfile(target):
            return
        if url != -1:
            # Fetch first and open the file afterwards: a failed download
            # must not leave an empty file that later runs would skip.
            res = urllib2.urlopen(urllib2.Request(url))
            try:
                data = res.read()
            finally:
                res.close()
        if data == -1:
            return
        with open(target, 'wb') as fp:
            fp.write(data)

    @staticmethod
    def _extract_title(data):
        '''Return the file-name-safe article title found in ``data``, or ''.'''
        marker = '<div id="title">'
        start = data.find(marker)
        if start == -1:
            return ''
        start += len(marker)
        end = data.find('</div>', start)
        if end == -1:
            end = len(data)
        title = data[start:end].rstrip()
        # Dots, colons and spaces are replaced by dashes in the file name.
        for char in '.: ':
            title = title.replace(char, '-')
        return title

    @staticmethod
    def _extract_mp3_url(data):
        '''Return the mp3 link inside the menubar div of ``data``, or ''.'''
        start = data.find('<div id="menubar">')
        if start == -1:
            return ''
        end = data.find('</div>', start)
        menubar = data[start:] if end == -1 else data[start:end]
        stop = menubar.find('.mp3">')
        if stop == -1:
            return ''
        # Search backwards from the ".mp3" match so that other links placed
        # earlier in the menubar are not glued onto the mp3 address.
        prefix = '<a href="'
        begin = menubar.rfind(prefix, 0, stop)
        if begin == -1:
            return ''
        return menubar[begin + len(prefix):stop + len('.mp3')]

    @staticmethod
    def _list_section(data):
        '''Return the part of ``data`` holding the article list, or ''.'''
        startag = '<span id="blist"><ul>'
        endtag = '</ul></span>\n'
        start = data.find(startag)
        if start == -1:
            return ''
        end = data.find(endtag, start)
        if end == -1:
            end = len(data)
        return data[start:end]

    def dlvoa(self, url):
        '''
        Download the mp3 of one article.

        url -- article path relative to ``self.basicurl``.

        The mp3 is stored as "<title>.mp3" through ``savefile``.  Pages
        without an mp3 link are reported and skipped.
        '''
        url = self.basicurl + url
        res = urllib2.urlopen(urllib2.Request(url))
        try:
            data = str(res.read())
        finally:
            res.close()
        title = self._extract_title(data)
        print('=---------------title')
        print(title)
        mp3url = self._extract_mp3_url(data)
        if not mp3url:
            print('no mp3 link found: ' + url)
            return
        if mp3url.startswith('/'):
            # Site-relative link: urllib2 needs an absolute address.
            mp3url = self.basicurl + mp3url
        self.savefile(title + '.mp3', url=mp3url)

    def geturls(self):
        '''Fetch the index page and store its article links in ``self.urls``.'''
        exurl = "/VOA_Standard_English"
        res = urllib2.urlopen(urllib2.Request(self.basicurl + exurl))
        try:
            data = str(res.read())
        finally:
            res.close()
        lister = URLLister()
        lister.feed(self._list_section(data))
        self.urls = lister.urls

    def dlurls(self):
        '''Download every article in ``self.urls``, fetching the list if empty.'''
        if not self.urls:
            self.geturls()
        for url in self.urls:
            self.dlvoa(url)
class URLLister(SGMLParser):
    '''SGML parser that records the href value of each <a> tag it handles.'''

    def __init__(self):
        SGMLParser.__init__(self)
        # Collected link targets, in the order they were encountered.
        self.urls = []

    def start_a(self, attrs):
        '''Append every href attribute in ``attrs`` to ``self.urls``.

        attrs -- sequence of (name, value) pairs for one <a> tag.
        '''
        for key, value in attrs:
            if key == 'href':
                self.urls.append(value)
# Script body: build the downloader, collect the article links, then fetch
# every article, announcing the start and the end of the run.
temp = Myvoa()
print('begin downloading')
temp.geturls()
temp.dlurls()
print('end downloading')
# Reposted from: https://my.oschina.net/u/870538/blog/109361