Python 下载 VOA

最新推荐文章于 2024-07-12 16:16:27 发布

cichang53502

最新推荐文章于 2024-07-12 16:16:27 发布

阅读量138

点赞数

文章标签： python

原文链接：https://my.oschina.net/u/870538/blog/109361

版权

import cookielib
import os
import sys
import urllib2
from sgmllib import SGMLParser


class Myvoa(object):
    '''
    Download the MP3 audio of articles published on http://www.51voa.com.

    Typical use: create an instance and call dlurls(). That fetches the
    "VOA_Standard_English" index page, collects the article links found in
    its list section and saves each article's MP3 into ``basicdir``.
    '''

    def __init__(self):
        self.basicurl = "http://www.51voa.com"
        # Directory that savefile() writes into (current directory by default).
        self.basicdir = os.curdir
        # Article links (appended to basicurl by dlvoa); filled by geturls().
        self.urls = []
        # Install a process-wide opener backed by a cookie jar so that every
        # later urllib2.urlopen() call shares the same cookies.
        cookie = cookielib.CookieJar()
        cookieProc = urllib2.HTTPCookieProcessor(cookie)
        opener = urllib2.build_opener(cookieProc)
        urllib2.install_opener(opener)

    def _fetch(self, url):
        '''Return the raw body of *url*, closing the response even on error.'''
        res = urllib2.urlopen(urllib2.Request(url))
        try:
            return res.read()
        finally:
            res.close()

    @staticmethod
    def _between(text, start_marker, end_marker):
        '''Return the text between the first *start_marker* and the next
        *end_marker*, or None when either marker is missing.'''
        start = text.find(start_marker)
        if start == -1:
            return None
        start += len(start_marker)
        end = text.find(end_marker, start)
        if end == -1:
            return None
        return text[start:end]

    def savefile(self, name, url=-1, data=-1):
        '''
        Save a file called *name* inside ``basicdir``.

        name -- file name; path separators are stripped from it.
        url  -- when not -1, the content is downloaded from this URL.
        data -- when *url* is -1 and *data* is not -1, *data* itself is
                written to the file.

        Nothing is written when the target file already exists or when
        neither *url* nor *data* is supplied.
        '''
        name = name.replace(os.sep, '')
        if os.altsep:
            name = name.replace(os.altsep, '')
        target = os.path.join(self.basicdir, name)
        print(url)
        if os.path.isfile(target):
            return
        if url != -1:
            # Download before opening the target: a failed request must not
            # leave an empty file behind that later runs would skip.
            payload = self._fetch(url)
        elif data != -1:
            payload = data
        else:
            return
        with open(target, 'wb') as fp:
            fp.write(payload)

    def dlvoa(self, url):
        '''
        Download the MP3 of one article.

        url -- article link, appended to ``basicurl``.

        The file name comes from the page's <div id="title"> text; the MP3
        address is the href ending in ".mp3" inside <div id="menubar">.
        The article is skipped, with a message, when either piece is missing.
        '''
        page = str(self._fetch(self.basicurl + url))

        title = self._between(page, '''<div id="title">''', '''</div>''')
        print('=---------------title')
        if title is None:
            print('no title found, skipped: ' + url)
            return
        # Replace characters that are awkward in file names.
        title = title.rstrip().replace(r".", "-").replace(r":", "-").replace(r" ", "-")
        print(title)

        menubar = self._between(page, '''<div id="menubar">''', '''</div>''')
        mp3url = None
        if menubar is not None:
            end = menubar.find('''.mp3">''')
            if end != -1:
                # Search backwards so the href belongs to the same anchor
                # as the ".mp3" suffix, not to an earlier link.
                start = menubar.rfind('''<a href="''', 0, end)
                if start != -1:
                    mp3url = menubar[start + len('''<a href="'''):end + len('.mp3')]
        if not mp3url:
            print('no mp3 link found, skipped: ' + url)
            return

        self.savefile(title + '.mp3', url=mp3url)

    def geturls(self):
        '''
        Fetch the "VOA_Standard_English" index page and store the links found
        in its <span id="blist"> list in ``self.urls``.

        ``self.urls`` becomes empty when the list section is not present.
        '''
        exurl = "/VOA_Standard_English"
        page = str(self._fetch(self.basicurl + exurl))

        startag = '''<span id="blist"><ul>'''
        endtag = '''</ul></span>\n'''
        start = page.find(startag)
        if start == -1:
            self.urls = []
            return
        end = page.find(endtag, start)
        if end == -1:
            # No closing marker: parse through the end of the page.
            end = len(page)

        lister = URLLister()
        lister.feed(page[start:end])
        # Flush anything the parser is still buffering.
        lister.close()
        self.urls = lister.urls

    def dlurls(self):
        '''Download every article in ``self.urls``, fetching the list first
        when it is still empty.'''
        if not self.urls:
            self.geturls()
        for url in self.urls:
            self.dlvoa(url)

class URLLister(SGMLParser):
def __init__(self):
SGMLParser.__init__(self)
self.urls = []
def start_a(self, attrs):
href = [v for k, v in attrs if k=='href']
if href:
self.urls.extend(href)

# Script entry: build the downloader, load the article list, then download
# the MP3 of every listed article, announcing the start and the end.
temp = Myvoa()
print('begin downloading')
temp.geturls()
temp.dlurls()
print('end downloading')

转载于:https://my.oschina.net/u/870538/blog/109361

cichang53502

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python 下载 VOA

import sys from sgmllib import SGMLParser import urllib2,cookielib class Myvoa(): ''' download texts and ...
复制链接

扫一扫