不务正业学了一点 Python,没啥可练习的,就做了个抓取百度歌曲的小脚本。代码很粗糙,有 bug,只生成一个方便用迅雷下载的 HTML 页面就算了,没时间继续弄,贴上来备忘。注意:urllib 用的是 IE 的设置,如果在 IE 内设置了代理服务器,urllib 也会走代理。
top50:http://top.baidu.com/mp3.html
import urllib
import os
import logging
from sgmllib import SGMLParser
class top50(SGMLParser):
    """List the top 50 mp3 search links found on a baidu.com chart page.

    Feed the chart HTML to an instance; the ``href`` of every ``<a>`` tag
    whose address contains ``wstsearch`` is collected in ``top50urls``.
    """

    # Class-level default kept so ``top50.top50urls`` still resolves. It is
    # never mutated: every instance gets its own list in reset().
    top50urls = []

    def reset(self):
        """Reset the parser and start a fresh, per-instance URL list.

        The base parser calls reset() when an instance is created, so each
        parser starts empty instead of appending to one list shared by all
        instances.
        """
        SGMLParser.reset(self)
        self.top50urls = []

    def start_a(self, attrs):
        """Handle an opening ``<a>`` tag.

        attrs is a list of (name, value) pairs for the tag's attributes.
        The tag's href values are recorded when the first one contains
        'wstsearch' at any position after the first character (an address
        that *starts* with 'wstsearch' is ignored, as in the original).
        """
        href = [v for k, v in attrs if k == 'href']
        if href and href[0].find('wstsearch') > 0:
            self.top50urls.extend(href)
# Build top50.html: a table with one link per song on Baidu's top-50 mp3
# chart, so the links can be handed to a download manager (Thunder).
# For each chart entry the script follows two pages: the song search page,
# then the first result page, from which the mp3 URL and title are cut out
# by plain string searching.
f = open('top50.html', 'w')
try:
    f.write(r'<html><head><meta http-equiv="Content-Type" content="text/html; charset=gb2312">')
    f.write(r'</head><body><center><h1>Top 50 mp3</h1>(press mouse right key and select thunder to download them)</center><p><hr>')
    f.write(r'<table>')
    number = 0
    # Row written when a page does not have the expected layout.
    error_row = '<tr><td>%d.</td><td>baidu.com error!!!</td></tr>'
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        filename='download_baidu.log',
                        filemode='w')
    print('connecting baidu.com...')
    html50 = urllib.urlopen('http://top.baidu.com/mp3.html').read()
    i50 = top50()
    i50.feed(html50)
    print('we get %d songs' % len(i50.top50urls))
    for i in i50.top50urls:
        # The published listing shows replace('&', '&'), which does nothing;
        # un-escaping the HTML entity '&amp;' is assumed to be the intent.
        # This is harmless when the parser has already decoded the entity.
        a = i.replace('&amp;', '&')
        b = a.replace(' ', r'%20')
        html1 = urllib.urlopen(b).read()
        logging.info(b)
        # Find the first hyperlink to the page that holds the mp3 file.
        start = html1.find('http://220.181.27.54')
        end = html1.find(r'"', start)
        a = html1[start:end].replace(' ', r'%20')
        if len(a) < 10:
            # No usable link (a failed find() yields a very short slice):
            # log the page for inspection and emit an error row.
            logging.debug(html1)
            number = number + 1
            f.write(error_row % number)
            print('%d baidu.com error!!!' % number)
            continue
        html2 = urllib.urlopen(a).read()
        # The mp3 URL is the value of the first href="..." on the page.
        start = html2.find('href')
        end = html2.find(r'"', start + 10)
        url = html2[start + 6:end]
        if len(url) > 100:
            # An implausibly long "URL" means the closing quote was not
            # where expected; treat the page as an error.
            logging.debug(html2)
            number = number + 1
            f.write(error_row % number)
            print('%d baidu.com error!!!' % number)
            continue
        end2 = html2.find(r'</a>', end)
        # NOTE(review): assumes exactly 18 characters of markup sit between
        # the closing quote of the href and the song title -- confirm
        # against the live page.
        name = html2[end + 18:end2]
        number = number + 1
        # The file could be downloaded right here with
        # open(name, 'wb').write(urllib.urlopen(url).read()), but the page
        # is meant to be fed to Thunder instead.
        # The href is quoted so a URL containing a space is not truncated;
        # url cannot contain '"' because it was cut at the first one.
        f.write('<tr><td>' + str(number) + '.' + '</td><td><a href="' + url + '">' + name + '</a></td></tr>')
        print('%d %s' % (number, name))
    f.write(r'</table></body></html>')
finally:
    # Close the report even when a network call raises part-way through.
    f.close()
# Hand the file name to the shell so the associated program opens it
# (presumably a browser on Windows, where this was written).
os.system('top50.html')
轻音乐:http://list.mp3.baidu.com/list/qingyinyue.html#top19
class topmusic(SGMLParser):
    """List the top music mp3 links found on a baidu.com list page.

    Feed the list-page HTML to an instance; the ``href`` of every ``<a>``
    tag whose address contains ``508&word`` is collected in ``topurls``.
    """

    # Class-level default kept so ``topmusic.topurls`` still resolves. It is
    # never mutated: every instance gets its own list in reset().
    topurls = []

    def reset(self):
        """Reset the parser and start a fresh, per-instance URL list.

        The base parser calls reset() when an instance is created, so each
        parser starts empty instead of appending to one list shared by all
        instances.
        """
        SGMLParser.reset(self)
        self.topurls = []

    def start_a(self, attrs):
        """Handle an opening ``<a>`` tag.

        attrs is a list of (name, value) pairs for the tag's attributes.
        The tag's href values are recorded when the first one contains
        '508&word' at any position after the first character.
        """
        href = [v for k, v in attrs if k == 'href']
        if href and href[0].find(r'508&word') > 0:
            self.topurls.extend(href)
top50:http://top.baidu.com/mp3.html
import urllib
import os
import logging
from sgmllib import SGMLParser
class top50(SGMLParser):
    """List the top 50 mp3 search links found on a baidu.com chart page.

    Feed the chart HTML to an instance; the ``href`` of every ``<a>`` tag
    whose address contains ``wstsearch`` is collected in ``top50urls``.
    """

    # Class-level default kept so ``top50.top50urls`` still resolves. It is
    # never mutated: every instance gets its own list in reset().
    top50urls = []

    def reset(self):
        """Reset the parser and start a fresh, per-instance URL list.

        The base parser calls reset() when an instance is created, so each
        parser starts empty instead of appending to one list shared by all
        instances.
        """
        SGMLParser.reset(self)
        self.top50urls = []

    def start_a(self, attrs):
        """Handle an opening ``<a>`` tag.

        attrs is a list of (name, value) pairs for the tag's attributes.
        The tag's href values are recorded when the first one contains
        'wstsearch' at any position after the first character (an address
        that *starts* with 'wstsearch' is ignored, as in the original).
        """
        href = [v for k, v in attrs if k == 'href']
        if href and href[0].find('wstsearch') > 0:
            self.top50urls.extend(href)
# Build top50.html: a table with one link per song on Baidu's top-50 mp3
# chart, so the links can be handed to a download manager (Thunder).
# For each chart entry the script follows two pages: the song search page,
# then the first result page, from which the mp3 URL and title are cut out
# by plain string searching.
f = open('top50.html', 'w')
try:
    f.write(r'<html><head><meta http-equiv="Content-Type" content="text/html; charset=gb2312">')
    f.write(r'</head><body><center><h1>Top 50 mp3</h1>(press mouse right key and select thunder to download them)</center><p><hr>')
    f.write(r'<table>')
    number = 0
    # Row written when a page does not have the expected layout.
    error_row = '<tr><td>%d.</td><td>baidu.com error!!!</td></tr>'
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        filename='download_baidu.log',
                        filemode='w')
    print('connecting baidu.com...')
    html50 = urllib.urlopen('http://top.baidu.com/mp3.html').read()
    i50 = top50()
    i50.feed(html50)
    print('we get %d songs' % len(i50.top50urls))
    for i in i50.top50urls:
        # The published listing shows replace('&', '&'), which does nothing;
        # un-escaping the HTML entity '&amp;' is assumed to be the intent.
        # This is harmless when the parser has already decoded the entity.
        a = i.replace('&amp;', '&')
        b = a.replace(' ', r'%20')
        html1 = urllib.urlopen(b).read()
        logging.info(b)
        # Find the first hyperlink to the page that holds the mp3 file.
        start = html1.find('http://220.181.27.54')
        end = html1.find(r'"', start)
        a = html1[start:end].replace(' ', r'%20')
        if len(a) < 10:
            # No usable link (a failed find() yields a very short slice):
            # log the page for inspection and emit an error row.
            logging.debug(html1)
            number = number + 1
            f.write(error_row % number)
            print('%d baidu.com error!!!' % number)
            continue
        html2 = urllib.urlopen(a).read()
        # The mp3 URL is the value of the first href="..." on the page.
        start = html2.find('href')
        end = html2.find(r'"', start + 10)
        url = html2[start + 6:end]
        if len(url) > 100:
            # An implausibly long "URL" means the closing quote was not
            # where expected; treat the page as an error.
            logging.debug(html2)
            number = number + 1
            f.write(error_row % number)
            print('%d baidu.com error!!!' % number)
            continue
        end2 = html2.find(r'</a>', end)
        # NOTE(review): assumes exactly 18 characters of markup sit between
        # the closing quote of the href and the song title -- confirm
        # against the live page.
        name = html2[end + 18:end2]
        number = number + 1
        # The file could be downloaded right here with
        # open(name, 'wb').write(urllib.urlopen(url).read()), but the page
        # is meant to be fed to Thunder instead.
        # The href is quoted so a URL containing a space is not truncated;
        # url cannot contain '"' because it was cut at the first one.
        f.write('<tr><td>' + str(number) + '.' + '</td><td><a href="' + url + '">' + name + '</a></td></tr>')
        print('%d %s' % (number, name))
    f.write(r'</table></body></html>')
finally:
    # Close the report even when a network call raises part-way through.
    f.close()
# Hand the file name to the shell so the associated program opens it
# (presumably a browser on Windows, where this was written).
os.system('top50.html')
轻音乐:http://list.mp3.baidu.com/list/qingyinyue.html#top19
class topmusic(SGMLParser):
    """List the top music mp3 links found on a baidu.com list page.

    Feed the list-page HTML to an instance; the ``href`` of every ``<a>``
    tag whose address contains ``508&word`` is collected in ``topurls``.
    """

    # Class-level default kept so ``topmusic.topurls`` still resolves. It is
    # never mutated: every instance gets its own list in reset().
    topurls = []

    def reset(self):
        """Reset the parser and start a fresh, per-instance URL list.

        The base parser calls reset() when an instance is created, so each
        parser starts empty instead of appending to one list shared by all
        instances.
        """
        SGMLParser.reset(self)
        self.topurls = []

    def start_a(self, attrs):
        """Handle an opening ``<a>`` tag.

        attrs is a list of (name, value) pairs for the tag's attributes.
        The tag's href values are recorded when the first one contains
        '508&word' at any position after the first character.
        """
        href = [v for k, v in attrs if k == 'href']
        if href and href[0].find(r'508&word') > 0:
            self.topurls.extend(href)