# Changba (唱吧) music scraper
import requests
import re
# Changba user page that is handed to changba() below.
url = 'http://changba.com/u/461549830'
def changba(url, timeout=10):
    """Download the songs listed on a Changba user page as ``.mp3`` files.

    The page at *url* is fetched and scanned for song links
    (``/s/`` followed by 22 characters) and for song titles.  Every song
    page is then fetched and searched for a direct ``.mp3`` address; the
    first address found is downloaded and written to ``<title>.mp3`` in the
    current working directory.  The collected links, titles and per-song
    ``.mp3`` matches are printed as the function goes.

    Args:
        url: Address of the Changba user page to scrape.
        timeout: Seconds to wait for each HTTP request before
            ``requests`` raises a timeout error.

    Returns:
        None.  Nothing is downloaded when the user page does not answer
        with HTTP 200.
    """
    res = requests.get(url, timeout=timeout)
    if res.status_code != 200:
        return
    res_html = res.text

    # Relative song links on the user page, turned into absolute URLs.
    reg1 = '<a href="(/s/.{22})" style="color:#999;display:block;" target="_blank">'
    media_mid = ['http://changba.com' + link for link in re.findall(reg1, res_html)]
    print(media_mid)

    # The title is whatever precedes the detail <div> on the same line.
    reg2 = '(.*)<div class="userPage-work-detail">'
    song_name = [raw.strip() for raw in re.findall(reg2, res_html)]
    print(song_name)

    # Raw string so that \w and \d reach the regex engine untouched.
    reg3 = re.compile(r'http://\w{4,20}.changba.com/\d{10}.mp3')

    # zip() pairs each link with its title and stops at the shorter list,
    # so a page yielding fewer titles than links cannot raise IndexError.
    for page_url, title in zip(media_mid, song_name):
        page_html = requests.get(page_url, timeout=timeout).text
        MP3_html = reg3.findall(page_html)
        print(MP3_html)
        if not MP3_html:
            continue
        MP3 = requests.get(MP3_html[0], timeout=timeout)
        if MP3.status_code != 200:
            continue
        # Titles come from scraped HTML; never let them act as a path.
        with open(_safe_filename(title) + '.mp3', 'wb') as f:
            f.write(MP3.content)


def _safe_filename(title):
    """Replace path separators and characters illegal in file names with ``_``."""
    return re.sub(r'[\\/:*?"<>|]', '_', title)
# Run the Changba scraper on the user page configured in `url`.
changba(url)
# WeSing (全民K歌, kg.qq.com) music scraper
from urllib import request
import re
import os
import json
# Personal page whose HTML embeds a JSON "ugclist" array of recordings.
url = "https://kg.qq.com/node/personal?uid=6a9d9a81222830833c"
# Use the response as a context manager so the connection is closed
# as soon as the body has been read.
with request.urlopen(url) as response:
    html = response.read().decode('utf-8')

# Grab the `"ugclist":[ ... ],` fragment (non-greedy, up to the first `],`).
data = re.findall(r'"ugclist":.*?],', html)
if not data:
    # Same exception type as indexing an empty list, but with a usable message.
    raise IndexError('no "ugclist" entry found in the page at ' + url)
# Drop the 10-character `"ugclist":` prefix and the trailing comma so that
# only the JSON array remains.
ugclists = data[0][10:-1]
print(len(data[0]))

# NOTE(review): there is no trailing separator, so files are written as
# Desktop/le<title>.m4a -- confirm whether a "le/" directory was intended.
path = r'C:/Users/HUAWEI/Desktop/le'
for ugclist in json.loads(ugclists):
    shareid = ugclist['shareid']
    print(shareid)
    title = ugclist['title']
    print(title)
    data_url = "http://cgi.kg.qq.com/fcgi-bin/fcg_get_play_url?shareid=" + shareid
    file = path + title + '.m4a'
    # Skip recordings whose target file already exists.
    if not os.path.exists(file):
        request.urlretrieve(data_url, file)
import urllib.request
import requests
import re
# Directory prefix that the downloaded files are written under.
path = "C://Users//HUAWEI//Desktop//kgqq//"
url = "https://kg.qq.com/node/personal?uid=6a9d9a81222830833c"
# NOTE(review): the header value itself begins with "User-Agent:", which
# looks like a copy/paste slip -- left unchanged, confirm before editing.
headers = {'User-Agent':'User-Agent:Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'}
req = urllib.request.Request(url, headers=headers)
# Context manager closes the connection once the page has been read.
with urllib.request.urlopen(req) as res:
    html = res.read().decode("utf-8")

# Each match is a (href, link text) pair for an anchor opened in a new tab.
pat = '<a href="(.*?)" .*? target="_blank">(.*?)</a>'
dlist = re.findall(pat, html)
# Matches a stream address up to and including the closing double quote.
pat_music = 'http://[a-z][a-z].stream.kg.qq.com.*.m4a.*?"'

# Separate names for the pair so the page address in `url` is not overwritten.
for song_url, song_title in dlist:
    music = urllib.request.Request(song_url, headers=headers)
    with urllib.request.urlopen(music) as song_res:
        music_html = song_res.read().decode("utf-8")
    mus = re.findall(pat_music, music_html)
    if not mus:
        continue
    # The pattern ends with a literal `"`, so every match carries that
    # quote as its last character; strip it before requesting the stream.
    stream_url = mus[0][:-1]
    MP3 = requests.get(stream_url)
    if MP3.content:
        # NOTE(review): the stream is matched as .m4a but saved as .mp3 --
        # extension kept as-is, confirm which one is wanted.
        with open(path + song_title + ".mp3", "wb") as f:
            f.write(MP3.content)
        print(song_title + ":" + song_url)
    else:
        # Nothing was received, so no empty file is left behind.
        print(song_title + ":" + song_url + "write error!")