#python爬取音乐网站

最新推荐文章于 2024-08-16 11:38:09 发布

川xc

最新推荐文章于 2024-08-16 11:38:09 发布

阅读量1.9k

点赞数

分类专栏： python-爬虫 文章标签： python-爬虫 音乐

本文链接：https://blog.csdn.net/qq_35201754/article/details/71699541

版权

python-爬虫专栏收录该内容

3 篇文章 0 订阅

订阅专栏

今天学习了网上一段爬取音乐的代码，自己也做了些微改动，使它能顺便爬取歌词，并整理了一些知识点（对刚接触几天的自己来说确实是知识点）。完整代码如下：

#!/usr/bin/python
# _*_ coding: utf-8 _*_
from bs4 import BeautifulSoup
import re
import requests
import json
import urllib2,urllib
import os


# Smallest size in bytes an existing .mp3 may have and still count as
# downloaded; a smaller file is fetched again.
minimumsize = 1
def getlist(url):
    """Fetch a song-list page and return one title node per song.

    Args:
        url: Address of the song-list page to download.

    Returns:
        A list with, for every ``<span class="song-title">`` element found
        on the page, that span's first child node.  Spans that have no
        child at all are skipped.
    """
    r = requests.get(url)
    content = r.content.decode('utf-8')

    soup = BeautifulSoup(content, 'lxml')
    # Build a fresh list rather than overwriting the result set while
    # iterating it; an empty span has no contents[0], so it is filtered out
    # instead of raising IndexError.
    return [span.contents[0]
            for span in soup.find_all('span', class_='song-title')
            if span.contents]

def _save_url(link, path):
    """Download *link* and write the raw bytes to *path*, closing the connection."""
    response = urllib2.urlopen(link)
    try:
        with open(path, "wb") as code:
            code.write(response.read())
    finally:
        response.close()


# Script: read the song titles from one Baidu Music song list, look each
# title up through the suggestion API, resolve the first hit to a download
# link, then save the .mp3 and (when available) the .lrc lyrics.
url = "http://music.baidu.com/songlist/365418600"
song_titles = getlist(url)

songdir = "mm"  # folder that receives the .mp3 files
songdir_lrc = 'lrc'  # sub-folder of songdir that receives the .lrc files

for value in song_titles:
    title = value.get_text()  # text of the title node
    print("Song Name: " + title)

    # Step 1: search by title to obtain a song id (parameters follow the
    # Baidu Music suggestion API).
    url = 'http://sug.music.baidu.com/info/suggestion'
    payload = {'word': title, 'version': '2', 'from': '0'}
    r = requests.get(url, params=payload)
    d = json.loads(r.text)
    data = d.get("data") if isinstance(d, dict) else None
    songs = data.get("song") if isinstance(data, dict) else None
    if not songs:  # missing keys or an empty result list
        print("do not have flac\n")
        continue
    songid = songs[0]["songid"]

    # Step 2: resolve the song id to its download and lyrics links.
    url = 'http://play.baidu.com/data/cloud/songlink'
    payload = {'songIds': songid, 'type': 'mp3'}
    r = requests.get(url, params=payload)
    d = json.loads(r.text)
    data = d.get("data") if isinstance(d, dict) else None
    song_list = data.get("songList") if isinstance(data, dict) else None
    if not song_list:
        continue
    info = song_list[0]
    songlink = info.get("songLink") or ""
    lrcLink = info.get("lrcLink") or ""
    print("lrc:" + lrcLink)
    if len(songlink) < 10:  # too short to be a real URL
        print("do not have flac\n")
        continue

    # The lyrics are written below songdir, so create the nested folder
    # (makedirs also creates songdir itself).
    lrc_dir = os.path.join(songdir, songdir_lrc)
    if not os.path.isdir(lrc_dir):
        os.makedirs(lrc_dir)

    songname = info["songName"]
    artistName = info["artistName"]
    basename = songname + "-" + artistName
    for separator in ("/", "\\"):
        # A path separator inside a title would point into another folder.
        basename = basename.replace(separator, "_")
    filename = "./" + songdir + "/" + basename + ".mp3"
    filename_lrc = "./" + songdir + "/" + songdir_lrc + "/" + basename + ".lrc"

    # Only open the remote file when it actually has to be downloaded.
    if not os.path.isfile(filename) or os.path.getsize(filename) < minimumsize:
        print("%s is downloading now ......\n" % songname)
        _save_url(songlink, filename)
    else:
        print("%s is already downloaded. Finding next song...\n\n" % songname)

    if len(lrcLink) < 8:
        print('歌词不存在')
        continue
    if lrcLink.startswith("/"):
        # The API may return the lyrics address relative to play.baidu.com.
        lrcLink = "http://play.baidu.com" + lrcLink
    if not os.path.isfile(filename_lrc):
        print("%s .lrc is downloading now ......\n" % songname)
        _save_url(lrcLink, filename_lrc)
    else:
        print("%s .lrc is already downloaded. Finding next song...\n\n" % songname)
print("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
print("finish!")

百度音乐api搜索方式：
1.搜索建议
请求地址（GET）：http://sug.music.baidu.com/info/suggestion
参数：
format : ‘json’ （照写就可以）
word : '' （搜索关键词，支持拼音、拼音简写）
version : 2（照写）
from : 0（照写）
作用：
获取歌曲id（通过id获取歌曲信息）
获取歌曲名称
获取歌手名称
获取歌手图片（小图）
2.请求地址（POST）：http://play.baidu.com/data/cloud/songlink
参数：
songIds : ''（要获取的歌曲信息的歌曲编号。可以多个，以逗号分隔）
hq:0
type:m4a,mp3
rate:
pt:0
flag:-1
s2p:-1
prerate:-1
bwt:-1
dur:-1
bat:-1
bp:-1
pos:-1
auto:-1
获取歌曲的lrc歌词地址（歌词地址为相对路径，请加上http://play.baidu.com）

知识点：
(1):requests爬取网页
r = requests.get(url)
content = r.content 或 contents = r.text
r.content 是网页内容的原始字节串，r.text 是解码后的文本
(2):获取标签内的无特殊标记的标签
这里写图片描述

   # Excerpt from the body of getlist(): replace every matched span by its first child.
   soup = BeautifulSoup(content,'lxml')
   mm=soup.find_all('span',class_='song-title')
   i = 0
   for m in  mm:
       mm[i] = m.contents[0]#take the first child node of the span
       i+=1
   return mm

mm里是上图的内容。mm是ResultSet，用for逐个取出后，用m.contents[0]取span标签内的第一个子节点（这里是其中的html标签）
(3)json.loads
重要函数：
编码：把一个Python对象编码转换成Json字符串 json.dumps()
解码：把Json格式字符串解码转换成Python对象 json.loads()
这里写图片描述
另外python解析json的例子

#!/usr/bin/python
import json
#Function: parse JSON data
#JSON is a text format that can describe data structures, like XML;
#for details, please refer to "http://json.org/json-zh.html".
#Note:
#1.If you want to write JSON from Python,
#you should use dump instead of load. Please refer to "help(json)".

json file:
The file content of temp.json is:
{
 "name":"00_sample_case1",
 "description":"an example."
}
# Parse the JSON document stored in temp.json; the with-block closes the
# file even when json.load raises.
with open("temp.json") as f:
    s = json.load(f)
print(s)
json string:
# Parse a JSON string into nested dicts/lists, then index into the result.
# Single-argument print(...) is valid on both Python 2 and Python 3.
s = json.loads('{"name":"test", "type":{"name":"seq", "parameter":["1", "2"]}}')
print(s)
print(s.keys())
print(s["name"])
print(s["type"]["name"])
print(s["type"]["parameter"][1])

(4)读写文件的模式

模式	描述
r	打开一个文件为只读。文件指针置于该文件的开头。这是默认模式。
rb	以二进制格式打开一个文件用于只读。文件指针置于该文件的开头。
r+	打开用于读取和写入文件。文件指针将会在文件的开头。
rb+	打开用于读取和写入二进制格式的文件。文件指针将会在文件的开头。
w	打开一个文件只写。覆盖该文件，如果该文件存在。如果该文件不存在，则创建用于写入一个新的文件。
wb	打开一个文件只能以二进制格式写入。覆盖该文件，如果该文件存在。如果该文件不存在，则创建用于写入一个新的文件。
w+	打开用于写入和读取的文件。覆盖现有的文件，如果文件存在。如果该文件不存在，则创建读取和写入新的文件。
wb+	打开用于写入和读取的二进制格式的文件。覆盖现有的文件，如果文件存在。如果该文件不存在，则创建读取和写入新的文件。
a	将打开追加文件。文件指针是在文件的结尾。也就是说，该文件是在附加模式。如果该文件不存在，它创造了写入一个新的文件。
ab	将打开追加的二进制格式的文件。文件指针在该文件的结束。也就是说，该文件为追加模式。如果该文件不存在，它创建并写入一个新的文件。
a+	打开为追加和读取文件。文件指针在该文件的结束。该文件将为追加模式。如果该文件不存在，它创建并读取和写入的新文件。
ab+	以二进制格式打开一个文件用于追加和读取。文件指针在该文件的结束。该文件将在追加模式。如果该文件不存在，它创建用于读取和写入的新文件。

(5)with语句

 with open(filename_lrc, "wb") as code:
        code.write(f_lrc.read())

相当于

# Hand-written counterpart of the with-statement: open, write, always close.
try:
    code = open(filename_lrc, 'wb')
except IOError:
    # open() signals failure with IOError; nothing to close in that case.
    print('fail to open')
    exit(-1)
try:
    code.write(f_lrc.read())
finally:
    # Close the file that was opened above, whether or not the write failed.
    code.close()

多个项时:

with open("filename.txt") as fn1, open('filename.txt') as fn2:
    do something with fn1,fn2

如：

# NOTE(review): `nested` is presumably contextlib.nested (Python 2 only) and
# would need `from contextlib import nested` first — confirm.
# Reads the three files in lockstep, driven by the lines of the first one.
with nested(open('file1'), open('file2'), open('file3')) as (f1,f2,f3):
    for i in f1:
        j = f2.readline()
        k = f3.readline()
        print(i,j,k)

# Open all three files in a single with-statement and read them in lockstep:
# each line of the first file is paired with the next line of the other two.
with open('file1') as f1, open('file2') as f2, open('file3') as f3:
    for first_line in f1:
        second_line = f2.readline()
        third_line = f3.readline()
        print(first_line, second_line, third_line)