# 实战:爬取音乐网站(Requests)
import re # python 的正则库
import requests # python 的requests库
import time
import os

# Directory the downloaded MP3 files are written to (created on demand).
DOWNLOAD_DIR = "/Users/yuanshuai/Downloads/music"
# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 10
# Pause between two downloads, in seconds, to avoid hammering the site.
DOWNLOAD_DELAY = 0.5

# Browser-like request headers sent with every chart-page request.
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "www.htqyy.com",
    "Referer": "http://www.htqyy.com/top/hot",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
}

# One chart entry carries both attributes side by side:
#   title="<song name>" sid="<song id>"
# Capturing them in a single match keeps every title paired with its own id.
SONG_PATTERN = re.compile(r'title="([^"]*)" sid="([^"]*)"')

# Characters that are path separators or otherwise invalid in file names.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def parse_song_list(html):
    """Return the ``(song_id, title)`` pairs found in a chart page's HTML.

    Args:
        html: Text of a chart page.

    Returns:
        A list of ``(song_id, title)`` string tuples in page order; empty
        when the page contains no entries.
    """
    return [(song_id, title) for title, song_id in SONG_PATTERN.findall(html)]


def safe_filename(title):
    """Return *title* made safe for use as a single file name.

    Path separators and other reserved characters become underscores, and
    leading/trailing spaces and dots are dropped so that a title can neither
    escape the download directory nor yield an empty name.
    """
    cleaned = UNSAFE_FILENAME_CHARS.sub("_", title).strip(" .")
    return cleaned or "untitled"


def fetch_chart_page(page_index):
    """Download one page of the hot-music chart and return its HTML text.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
    """
    url = (
        "http://www.htqyy.com/top/musicList/hot?pageIndex="
        + str(page_index)
        + "&pageSize=20"
    )
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def download_song(song_id, title):
    """Download the MP3 for *song_id* into ``DOWNLOAD_DIR``.

    The file is named after the sanitised *title*.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status (so an error page is never saved as an ``.mp3``).
    """
    song_url = "http://f2.htqyy.com/play8/" + str(song_id) + "/mp3/7"
    response = requests.get(song_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    target = os.path.join(DOWNLOAD_DIR, "{}.mp3".format(safe_filename(title)))
    with open(target, "wb") as f:
        f.write(response.content)


def main():
    """Ask how many chart pages to crawl, then download every listed song."""
    # The value is used as a page COUNT (pages 0 .. count-1), so the prompt
    # asks for the number of pages rather than for a single page index.
    page_count = int(input("请输入您要爬取的页数:"))

    songs = []
    for page_index in range(page_count):
        try:
            songs.extend(parse_song_list(fetch_chart_page(page_index)))
        except requests.RequestException as err:
            # A single unreachable page should not abort the whole run.
            print("获取榜单页面失败,已跳过:", page_index, err)

    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    for number, (song_id, title) in enumerate(songs, start=1):
        print("正在下载第", number, "首,歌曲名为:", title)
        try:
            download_song(song_id, title)
        except (requests.RequestException, OSError) as err:
            # Skip this song but keep going with the rest of the list.
            print("下载失败,已跳过:", title, err)
        time.sleep(DOWNLOAD_DELAY)


if __name__ == "__main__":
    main()