1. 运行 wget -i Baidump3URL.txt,下载列表中的各个歌曲列表页面。
2. 运行 BaiduMp3.py > baidump3.txt,将提取出的歌名写入 baidump3.txt。
3. 运行 del *.htm? 删除已下载的页面文件。
完成以上步骤即可获得 baidu 所有歌曲的名字列表。
Baidump3URL.txt:
http://list.mp3.baidu.com/song/A.htm
http://list.mp3.baidu.com/song/B.htm
http://list.mp3.baidu.com/song/C.htm
http://list.mp3.baidu.com/song/D.htm
http://list.mp3.baidu.com/song/E.htm
http://list.mp3.baidu.com/song/F.htm
http://list.mp3.baidu.com/song/G.htm
http://list.mp3.baidu.com/song/H.htm
http://list.mp3.baidu.com/song/J.htm
http://list.mp3.baidu.com/song/K.htm
http://list.mp3.baidu.com/song/L.htm
http://list.mp3.baidu.com/song/M.htm
http://list.mp3.baidu.com/song/N.htm
http://list.mp3.baidu.com/song/O.htm
http://list.mp3.baidu.com/song/P.htm
http://list.mp3.baidu.com/song/Q.htm
http://list.mp3.baidu.com/song/R.htm
http://list.mp3.baidu.com/song/S.htm
http://list.mp3.baidu.com/song/T.htm
http://list.mp3.baidu.com/song/W.htm
http://list.mp3.baidu.com/song/X.htm
http://list.mp3.baidu.com/song/Y.htm
http://list.mp3.baidu.com/song/Z.htm
BaiduMp3.py:
#!/usr/bin/python
import urllib
import string
import re
def GetContent(url):
    """Fetch ``url`` and return everything read from it.

    Parameters:
        url: address handed unchanged to ``urllib.urlopen``.

    Returns:
        The data returned by ``read()`` on the opened URL, or ``None`` when
        opening the URL raises ``IOError`` (a diagnostic is printed to
        stdout in that case).

    Raises:
        Whatever ``read()`` raises; the handle is closed before the
        exception propagates.
    """
    try:
        URLFile = urllib.urlopen(url)
    except IOError:
        # Single pre-formatted argument so the statement prints the same
        # text whether ``print`` is a statement or a function.  The doubled
        # space and the space before "!" reproduce what the original
        # comma-separated print statement emitted.
        print("\nCan not retrieve  %s !\n"
              "The connection cannot be made!\n" % (url,))
        return None
    # Close the handle even if read() fails part-way through.
    try:
        return URLFile.read()
    finally:
        URLFile.close()
# NOTE: this source is kept ASCII-only on purpose.  The file carries no
# PEP 263 coding declaration, so a raw non-ASCII literal would be a
# SyntaxError under Python 2; non-ASCII characters are spelled as escapes.

# Anchor text of one song link.  Applied to the raw (still cp936-encoded)
# page bytes, hence a bytes pattern.
_TITLE_LINK_RE = re.compile(br'" target=_blank>(.*?)</[aA]></td>')
# Captures everything that follows the first hyphen or space in a title.
_AFTER_SEPARATOR_RE = re.compile(u'[- ](.*)')
# Characters removed from a title: U+300A and U+300B (double angle
# brackets), comma, full stop, U+00B7 (middle dot) and exclamation mark.
_PUNCTUATION_RE = re.compile(u'[\u300a\u300b,.\xb7!]')
# Prefix shared by every listing URL; replaced by '.' to locate the copy
# that wget saved in the current directory.
_LISTING_URL_PREFIX = 'http://list.mp3.baidu.com/song'


def _local_page_path(url):
    """Return the current-directory path of the saved copy of ``url``."""
    return url.replace(_LISTING_URL_PREFIX, '.')


def _collect_raw_titles(urls):
    """Return the distinct raw titles found in the saved pages, in order.

    Only the first link on each line is considered, and duplicates are
    detected on the undecoded bytes.  A missing page file raises IOError.
    """
    seen = set()  # O(1) duplicate check; ``ordered`` keeps first-seen order
    ordered = []
    for url in urls:
        with open(_local_page_path(url), 'rb') as page:
            content = page.read()
        for line in content.split(b'\n'):
            match = _TITLE_LINK_RE.search(line)
            if match is None:
                continue
            raw_title = match.group(1)
            if raw_title not in seen:
                seen.add(raw_title)
                ordered.append(raw_title)
    return ordered


def _clean_title(raw_title):
    """Decode a cp936 title and strip its prefix and punctuation.

    If the title contains a hyphen or a space, only the text after the
    first one is kept.  Returns a unicode string, possibly empty.
    """
    title = raw_title.decode('cp936')
    tail = _AFTER_SEPARATOR_RE.search(title)
    if tail is not None:
        title = tail.group(1)
    return _PUNCTUATION_RE.sub(u'', title)


def main(url_list_path='Baidump3URL.txt', out=None):
    """Write the cleaned, de-duplicated song names, one per line, as UTF-8.

    Parameters:
        url_list_path: text file with one listing URL per line.  Blank
            lines and surrounding whitespace are ignored, so a trailing
            newline no longer produces an empty path to open.
        out: binary writable object receiving the output; defaults to
            standard output.

    Raises:
        IOError: if the URL list or one of the saved pages cannot be read.
        UnicodeDecodeError: if a title is not valid cp936.
    """
    import sys

    if out is None:
        # Python 3 exposes the byte stream as sys.stdout.buffer; Python 2's
        # sys.stdout accepts byte strings directly.
        out = getattr(sys.stdout, 'buffer', sys.stdout)

    with open(url_list_path, 'r') as url_list:
        urls = [line.strip() for line in url_list if line.strip()]

    for raw_title in _collect_raw_titles(urls):
        title = _clean_title(raw_title)
        if title:
            out.write(title.encode('utf8') + b'\n')


if __name__ == "__main__":
    main()