获取歌手json数据的url:
json数据部分截图
从上面链接返回的数据中取出歌手mid,然后再根据下面的url获得该歌手的歌曲列表
json数据部分截图
下面是调用的详细代码,其中的HTTPUtil.sendGet方法是发送http请求的方法。在我的这篇博客
https://blog.csdn.net/Hello_Ray/article/details/80762232 中,代码的第11行和第42-48行是发送http请求的代码,可以参考。
在测试这个方法时,dao方法可以注释掉;HotSongSpider是从别的类中引入的、用于转换list的方法,也可以注释掉。
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.*;
/**
* <p>description: </p>
*
* @author chenrui
* @since 2018-08-09
*/
@Component
public class SpiderSingerNew {
private static Logger log = LogManager.getLogger(SpiderSingerNew.class);
@Resource
private HotSongSpider hotSongSpider;
@Resource
private HotSongDao hotSongDao;
private Integer singerListTotalPage = 1; //所有页数
private Integer singerSongListTotalPage = 1;
//成功的歌手列表
List<QQ2Singer> singerList = new ArrayList<>();
//爬取歌手列表出现的错误标签和页面
public static Queue<Map<String, Integer>> errorSingerList = new LinkedList<>();
public static Queue<Map<String, String>> errorSingerSongList = new LinkedList<>();
//爬取所有歌手的url
String singerListUrl = "https://u.y.qq.com/cgi-bin/musicu.fcg?format=jsonp&inCharset=utf8&outCharset=utf-8&data=";
String param = "{\"comm\":{\"ct\":24,\"cv\":10000},\"singerList\":{\"module\":\"Music.SingerListServer\",\"method\":\"get_singer_list\",\"param\":{\"area\":-100,\"sex\":-100,\"genre\":-100,\"index\":{index},\"sin\":{sin},\"cur_page\":{cur_page}}}}";
//歌手下的全部歌曲
String singerSongListUrl = "https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg?inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq&needNewCode=0&singermid={singermid}&order=listen&begin={begin}&num={num}";
public void spiderSingerList() {
log.info("--start spider singer list--");
int index = 1;
//singer的list
List<QQ2Singer> qq2SingerList = new ArrayList<>();
try {
for(; index<=27; index++) {
for(int i=0; i < singerListTotalPage; i++) {
String sendParam = param.replace("{index}", String.valueOf(index)).replace("{sin}", String.valueOf(i*80)).replace("{cur_page}", String.valueOf(i+1));
String sendUrl = singerListUrl + URLEncoder.encode(sendParam, String.valueOf(StandardCharsets.UTF_8));
String content = HTTPUtil.sendGet(sendUrl).getContent();
parseSingerList(content, index, i);
}
//TODO 计算总页数
singerListTotalPage=1;
}
} catch (Exception e) {
log.error("error : start spider singer list {} {}", e.getMessage(), e.getCause());
e.printStackTrace();
}
//循环处理出错的歌手,最多四次
if(errorSingerList.size() > 0) {
int singerEndlessFlag = 1;
do {
spiderErrorSingerList();
singerEndlessFlag++;
} while(singerEndlessFlag < 4 && errorSingerList.size()>0);
}
log.info("--finish spider singer list--");
log.info("--start spider singer song list--");
for(int i=0; i<singerList.size();i++) {
String singermid = singerList.get(i).getSinger_mid();
System.out.println("--开始第"+ (i+1) +"/"+singerList.size()+" singermid: "+ singermid +" 歌手的歌曲爬取--");
spiderSingerSongList(singermid);
}
log.info("--处理歌手歌曲下的错误--");
if(errorSingerSongList.size() > 0) {
//循环处理出错的歌手,最多四次
int songEndlessFlag = 1;
do {
spiderErrorSingerSongList();
songEndlessFlag++;
} while(errorSingerSongList.size()>0 && songEndlessFlag < 4 );
}
log.info("--finish spider singer song list--");
}
private int sum = 0;
/**
* singerlist是全局变量
* 爬取全网的歌手速度很快,不用受到限制,所以先将歌手爬完再去爬取歌曲。
* @param content
* @param index
* @param page
*/
public void parseSingerList(String content, int index, int page) {
log.info("开始 index:{} page:{} 将singer的json数组转为singer的list集合", index, page+1);
JSONObject json = JSONObject.parseObject(content);
try {
//TODO 计算总页数
if(singerListTotalPage<=1) {
int total = json.getJSONObject("singerList").getJSONObject("data").getInteger("total");
sum += total;
System.out.println("singer list的index: "+index+" page: "+ page +" " + total + " sum " + sum);
singerListTotalPage = (total + 80 - 1) / 80;
}
//将singer的json数组转为singer的list集合
JSONArray singerListJsonArray = json.getJSONObject("singerList").getJSONObject("data").getJSONArray("singerlist");
for(int i=0; i<singerListJsonArray.size(); i++) {
QQ2Singer qq2Singer = singerListJsonArray.getJSONObject(i).toJavaObject(QQ2Singer.class);
//将歌手信息添加到全局变量中
singerList.add(qq2Singer);
/*hotSongDao.saveQQSinger(qq2Singer);*/
hotSongDao.update(qq2Singer);
}
} catch (Exception e) {
log.error("转换singerlist,url标签位置{}, 所在页面{}, 出现错误 {} {}", index, page,e.getMessage(), e.getCause());
Map<String , Integer> errorInfo = new HashMap<>();
errorInfo.put("index", index);
errorInfo.put("page", page);
errorSingerList.add(errorInfo);
}
log.info("完成 index:{} page:{} 将singer的json数组转为singer的list集合", index, page+1);
}
/**
* 继续根据歌手列表出现的错误标签和页面爬取歌手
* errorSingerList定义的全局变量
*/
public void spiderErrorSingerList() {
Map<String, Integer> errorMap = errorSingerList.poll();
while(errorMap!= null) {
int index = errorMap.get("index");
int page = errorMap.get("page");
try {
String sendParam = param.replace("{index}", String.valueOf(index)).replace("{sin}", String.valueOf(page*80)).replace("{cur_page}", String.valueOf(page+1));
String sendUrl = singerListUrl + URLEncoder.encode(sendParam, String.valueOf(StandardCharsets.UTF_8));
String content = HTTPUtil.sendGet(sendUrl).getContent();
//根据index和page爬取对应的歌手
parseSingerList(content, index, page);
errorMap = errorSingerList.poll();
} catch (Exception e) {
Map<String , Integer> errorInfo = new HashMap<>();
errorInfo.put("index", index);
errorInfo.put("page", page);
errorSingerList.add(errorInfo);
}
}
}
/**
* 爬取歌手下的全部歌曲
* @param singerMid
*/
public void spiderSingerSongList(String singerMid) {
List<HotSong> hotSongs = new ArrayList<>();
try {
for(int i=0; i<singerSongListTotalPage; i++) {
String sendUrl = singerSongListUrl.replace("{singermid}", singerMid).replace("{begin}", String.valueOf(30*i)).replace("{num}", String.valueOf(30));
String content = HTTPUtil.sendGet(sendUrl).getContent();
parseSingerSongList(content, hotSongs, singerMid, i);
}
//TODO
singerSongListTotalPage=1;
} catch (Exception e) {
log.error("爬取 {} 歌手下面的歌曲出现错误", singerMid);
}
List<Song> songList = hotSongSpider.transToPO(hotSongs);
hotSongDao.saveOrUpdate(songList);
}
/**
* content
* @param content
* @param singerMid
*/
public void parseSingerSongList(String content, List<HotSong> hotSongs, String singerMid, int page) {
JSONObject json = JSONObject.parseObject(content);
//TODO
if(singerSongListTotalPage<=1) {
int total = json.getJSONObject("data").getInteger("total");
singerSongListTotalPage = (total + 30-1)/30;
}
JSONArray jsonSongArray = json.getJSONObject("data").getJSONArray("list");
try {
for(int i=0; i < jsonSongArray.size(); i++) {
HotSong song = jsonSongArray.getJSONObject(i).getJSONObject("musicData").toJavaObject(HotSong.class);
//获得album_id爬取相应的图片
String mid = song.getAlbummid();
String albumImg = null;
try {
//根据url拼接url+album_mid组成完成url,比如https://y.qq.com/n/yqq/album/004PCOKh1RUAqZ.html
String albumImgUri = MusicConstants.ALBUM_IMG_URI + mid + ".html";
//降低爬取速度
String html = HTTPUtil.sendGet(albumImgUri).getContent();
albumImg = Jsoup.parse(html).getElementById("albumImg").attr("src");
} catch (Exception e) {
}
song.setAlbumImg(albumImg);
hotSongs.add(song);
}
} catch (Exception e) {
Map<String , String> errorInfo = new HashMap<>();
errorInfo.put("singerMid", singerMid);
errorInfo.put("page", String.valueOf(page));
errorSingerSongList.add(errorInfo);
log.error("爬取 {} 歌手, page: {} 下面的歌曲出现错误", singerMid, page);
}
}
/**
* 继续根据歌手列表出现的错误标签和页面爬取歌手
* errorSingerList定义的全局变量
*/
public void spiderErrorSingerSongList() {
Map<String, String> errorMap = errorSingerSongList.poll();
while(errorMap!= null) {
List<HotSong> hotSongs = new ArrayList<>();
String singerMid = errorMap.get("singerMid");
int page = Integer.parseInt(errorMap.get("page"));
try {
String sendUrl = singerSongListUrl.replace("{singermid}", singerMid).replace("{begin}", String.valueOf(30*page)).replace("{num}", String.valueOf(30));
String content = HTTPUtil.sendGet(sendUrl).getContent();
parseSingerSongList(content, hotSongs, singerMid, page);
List<Song> songList = hotSongSpider.transToPO(hotSongs);
//保存歌曲
hotSongDao.saveOrUpdate(songList);
errorMap = errorSingerSongList.poll();
} catch (Exception e) {
Map<String , String> errorInfo = new HashMap<>();
errorInfo.put("singerMid", singerMid);
errorInfo.put("page", String.valueOf(page));
errorSingerSongList.add(errorInfo);
}
}
}
}
非常感谢有小伙伴能够复现和测试我的代码。如果在复现过程中出现问题,或者有关键方法没有引入,请联系我或者留言。
特别注意:由于水平有限代码肯定存在不足的地方,希望能够指正,那么我将做到更好。
email:chenrui@marsdl.com