IT小白终于脱离hello world,实现用java爬取网易云热评并制作词云,激动激动激动。
假期被游戏缠绕无法自拔?或是被淡黄的长裙蓬松的头发洗脑?是否因为假期太长而闲着不知道干嘛呢,还不来敲代码!用java一起来爬个虫?
一、自我介绍
南京某高校大学生,今年寒假因为疫情,假期无限延长aaa,自己在家也不知道学点什么,有一段时间很迷茫,后来因为机缘巧合,碰到了一些it大佬,于是乎,我就代码敲敲敲,啊,这不,已经可以做个小爬虫啦哈哈哈
二、上代码
这个小项目是用java爬取网易云热门歌曲并制作热评的热词云图,效果如下:
先列一下pom里加的依赖吧
<!-- https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp -->
<dependencies>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>4.1.0</version>
</dependency>
<!-- JSON 操作库 -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.62</version>
</dependency>
<!-- 下面tokenizers是为了中文分词引入 -->
<dependency>
<groupId>com.kennycason</groupId>
<artifactId>kumo-tokenizers</artifactId>
<version>1.17</version>
</dependency>
</dependencies>
这个是项目结构
1、封装各个model类
专辑类
package com.youkeda.music.model;
/**
 * Album a song belongs to.
 */
public class Album {

    /** Album identifier. */
    private String id;

    /** Album title. */
    private String name;

    /** Address of the album picture. */
    private String picUrl;

    /** @return the album identifier */
    public String getId() {
        return this.id;
    }

    /** @param id the album identifier */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the album title */
    public String getName() {
        return this.name;
    }

    /** @param name the album title */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the address of the album picture */
    public String getPicUrl() {
        return this.picUrl;
    }

    /** @param picUrl the address of the album picture */
    public void setPicUrl(String picUrl) {
        this.picUrl = picUrl;
    }
}
歌手类(文中也称“歌单”)
package com.youkeda.music.model;
import java.util.List;
/**
 * Artist together with the songs crawled for it. The surrounding code also
 * refers to this object as a "playlist".
 */
public class Artist {

    /** Artist identifier. */
    private String id;

    /** Alternative names of the artist. */
    private List<String> alias;

    /** Address of the artist picture. */
    private String picUrl;

    /** Short description of the artist. */
    private String briefDesc;

    /** Address of the 1:1 image, as named by the source data field {@code img1v1Url}. */
    private String img1v1Url;

    /** Artist name. */
    private String name;

    /** Songs attached to this artist. */
    private List<Song> songList;

    /** @return the artist identifier */
    public String getId() {
        return this.id;
    }

    /** @param id the artist identifier */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the alternative names */
    public List<String> getAlias() {
        return this.alias;
    }

    /** @param alias the alternative names */
    public void setAlias(List<String> alias) {
        this.alias = alias;
    }

    /** @return the address of the artist picture */
    public String getPicUrl() {
        return this.picUrl;
    }

    /** @param picUrl the address of the artist picture */
    public void setPicUrl(String picUrl) {
        this.picUrl = picUrl;
    }

    /** @return the short description */
    public String getBriefDesc() {
        return this.briefDesc;
    }

    /** @param briefDesc the short description */
    public void setBriefDesc(String briefDesc) {
        this.briefDesc = briefDesc;
    }

    /** @return the address of the 1:1 image */
    public String getImg1v1Url() {
        return this.img1v1Url;
    }

    /** @param img1v1Url the address of the 1:1 image */
    public void setImg1v1Url(String img1v1Url) {
        this.img1v1Url = img1v1Url;
    }

    /** @return the artist name */
    public String getName() {
        return this.name;
    }

    /** @param name the artist name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the songs attached to this artist */
    public List<Song> getSongList() {
        return this.songList;
    }

    /** @param songList the songs attached to this artist */
    public void setSongList(List<Song> songList) {
        this.songList = songList;
    }
}
评论类
package com.youkeda.music.model;
/**
 * A single comment left on a song.
 */
public class Comment {

    /** Comment identifier. */
    private String id;

    /** Text of the comment. */
    private String content;

    /** Number of likes, kept as text. */
    private String likedCount;

    /** Time of the comment, kept as text exactly as delivered by the source data. */
    private String time;

    /** User who wrote the comment. */
    private User commentUser;

    /** @return the comment identifier */
    public String getId() {
        return this.id;
    }

    /** @param id the comment identifier */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the text of the comment */
    public String getContent() {
        return this.content;
    }

    /** @param content the text of the comment */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the number of likes as text */
    public String getLikedCount() {
        return this.likedCount;
    }

    /** @param likedCount the number of likes as text */
    public void setLikedCount(String likedCount) {
        this.likedCount = likedCount;
    }

    /** @return the time of the comment as text */
    public String getTime() {
        return this.time;
    }

    /** @param time the time of the comment as text */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the user who wrote the comment */
    public User getCommentUser() {
        return this.commentUser;
    }

    /** @param commentUser the user who wrote the comment */
    public void setCommentUser(User commentUser) {
        this.commentUser = commentUser;
    }
}
歌曲类
package com.youkeda.music.model;
//歌曲类
import java.util.List;
/**
 * A song with its singers, album, audio file address and comments.
 */
public class Song {

    /** Song identifier. */
    private String id;

    /** Song title. */
    private String name;

    /** Singers performing the song. */
    private List<User> singers;

    /** Address of the audio file. */
    private String sourceUrl;

    /** Album the song belongs to. */
    private Album album;

    /** Hot comments of the song. */
    private List<Comment> hotComments;

    /** Regular comments of the song. */
    private List<Comment> comments;

    /** @return the song identifier */
    public String getId() {
        return this.id;
    }

    /** @param id the song identifier */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the song title */
    public String getName() {
        return this.name;
    }

    /** @param name the song title */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the singers performing the song */
    public List<User> getSingers() {
        return this.singers;
    }

    /** @param singers the singers performing the song */
    public void setSingers(List<User> singers) {
        this.singers = singers;
    }

    /** @return the address of the audio file */
    public String getSourceUrl() {
        return this.sourceUrl;
    }

    /** @param sourceUrl the address of the audio file */
    public void setSourceUrl(String sourceUrl) {
        this.sourceUrl = sourceUrl;
    }

    /** @return the album the song belongs to */
    public Album getAlbum() {
        return this.album;
    }

    /** @param album the album the song belongs to */
    public void setAlbum(Album album) {
        this.album = album;
    }

    /** @return the hot comments */
    public List<Comment> getHotComments() {
        return this.hotComments;
    }

    /** @param hotComments the hot comments */
    public void setHotComments(List<Comment> hotComments) {
        this.hotComments = hotComments;
    }

    /** @return the regular comments */
    public List<Comment> getComments() {
        return this.comments;
    }

    /** @param comments the regular comments */
    public void setComments(List<Comment> comments) {
        this.comments = comments;
    }
}
评论者
package com.youkeda.music.model;
/**
 * A user of the music service: the author of a comment, and also the type
 * used for the singers of a song.
 */
public class User {

    /** User identifier. */
    private String id;

    /** Display name. */
    private String nickName;

    /** Address of the avatar image. */
    private String avatar;

    /** @return the user identifier */
    public String getId() {
        return this.id;
    }

    /** @param id the user identifier */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the display name */
    public String getNickName() {
        return this.nickName;
    }

    /** @param nickName the display name */
    public void setNickName(String nickName) {
        this.nickName = nickName;
    }

    /** @return the address of the avatar image */
    public String getAvatar() {
        return this.avatar;
    }

    /** @param avatar the address of the avatar image */
    public void setAvatar(String avatar) {
        this.avatar = avatar;
    }
}
2、实现各项功能的service类
service接口类
package com.youkeda.music.service;
import com.youkeda.music.model.Artist;
import com.youkeda.music.model.Song;
/**
 * Contract of the song crawling service.
 */
public interface SongCrawlerService {

    /**
     * Looks up an artist by id.
     *
     * @param artistId id of the artist
     * @return the matching artist
     */
    Artist getArtist(String artistId);

    /**
     * Looks up one song of an artist.
     *
     * @param artistId id of the artist the song belongs to
     * @param songId   id of the song
     * @return the matching song
     */
    Song getSong(String artistId, String songId);

    /**
     * Runs the crawl for the given artist.
     *
     * @param artistId id of the artist to crawl
     */
    void start(String artistId);
}
接口实现类
package com.youkeda.music.service.impl;
import com.alibaba.fastjson.JSON;
import com.youkeda.music.model.*;
import com.youkeda.music.service.SongCrawlerService;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.youkeda.music.util.WordCloudUtil;
import okhttp3.Call;
import okhttp3.OkHttpClient;
import okhttp3.Request;
/**
* 音乐抓取服务的实现
*/
public class SongCrawlerServiceImpl implements SongCrawlerService {
private static final String ARTIEST_API_PREFIX = "http://neteaseapi.youkeda.com:3000/artists?id=";
private static final String S_D_API_PREFIX = "http://neteaseapi.youkeda.com:3000/song/detail?ids=";
private static final String S_C_API_PREFIX = "http://neteaseapi.youkeda.com:3000/comment/music?id=";
private static final String S_F_API_PREFIX = "http://neteaseapi.youkeda.com:3000/song/url?id=";
// okHttpClient 实例
private OkHttpClient okHttpClient;
// 歌单数据仓库
private Map<String, Artist> artists;
private void init() {
//1. 构建 okHttpClient 实例
okHttpClient = new OkHttpClient();
artists = new HashMap<>();
}
@Override
public void start(String artistId) {
// 参数判断,未输入参数则直接返回
if (artistId == null || artistId.equals("")) {
return;
}
// 执行初始化
init();
//各个方法都重构封装,减少耦合性
initArtistHotSongs(artistId);
assembleSongDetail(artistId);
assembleSongComment(artistId);
assembleSongUrl(artistId);
generateWordCloud(artistId);
}
@Override
public Artist getArtist(String artistId) {
return artists.get(artistId);
}
@Override
public Song getSong(String artistId, String songId) {
Artist artist = artists.get(artistId);
List<Song> songs = artist.getSongList();
if (songs == null) {
return null;
}
for (Song song : songs) {
if (song.getId().equals(songId)) {
return song;
}
}
return null;
}
@SuppressWarnings("unchecked")
private Map getSourceDataObj(String prefix, String postfix) {
// 构建歌单url
String aUrl = prefix + postfix;
// 调用 okhttp3 获取返回数据
String content = getPageContentSync(aUrl);
// 反序列化成 Map 对象
Map returnData = JSON.parseObject(content, Map.class);
return returnData;
}
@SuppressWarnings("unchecked")
private Artist buildArtist(Map returnData) {
// 从 Map 对象中取得 歌单 数据。歌单也是一个子 Map 对象。
Map artistData = (Map) returnData.get("artist");
Artist artist = new Artist();
artist.setId(artistData.get("id").toString());
if (artistData.get("picUrl") != null) {
artist.setPicUrl(artistData.get("picUrl").toString());
}
artist.setBriefDesc(artistData.get("briefDesc").toString());
artist.setImg1v1Url(artistData.get("img1v1Url").toString());
artist.setName(artistData.get("name").toString());
artist.setAlias((List) artistData.get("alias"));
return artist;
}
private List<Song> buildSongs(Map returnData) {
// 从 Map 对象中取得一组 歌曲 数据
List songsData = (List) returnData.get("hotSongs");
List<Song> songs = new ArrayList<>();
for (int i = 0; i < songsData.size(); i++) {
Map songData = (Map) songsData.get(i);
Song songObj = new Song();
songObj.setId(songData.get("id").toString());
songObj.setName(songData.get("name").toString());
songs.add(songObj);
}
return songs;
}
/**
* 根据输入的url,读取页面内容并返回
*/
private String getPageContentSync(String url) {
//2.定义一个request
Request request = new Request.Builder().url(url).build();
//3.使用client去请求
Call call = okHttpClient.newCall(request);
String result = null;
try {
//4.获得返回结果
result = call.execute().body().string();
System.out.println("call " + url + " , content's size=" + result.length());
} catch (IOException e) {
System.out.println("request " + url + " error . ");
e.printStackTrace();
}
return result;
}
/**
* 初始化歌单及热门歌曲
*
* @param artistId
*/
private void initArtistHotSongs(String artistId) {
// 取得整体数据对象。
Map returnData = getSourceDataObj(ARTIEST_API_PREFIX, artistId);
// 构建填充了属性的 Artist 实例
Artist artist = buildArtist(returnData);
// 构建一组填充了属性的 Song 实例
List<Song> songs = buildSongs(returnData);
// 歌曲填入歌单
artist.setSongList(songs);
// 存入本地
artists.put(artist.getId(), artist);
}
/**
* 装配 歌曲详情
*
* @param artistId
*/
@SuppressWarnings("unchecked")
private void assembleSongDetail(String artistId) {
Artist artist = getArtist(artistId);
// 取不到歌单说明参数输入错误
if (artist == null) {
return;
}
List<Song> songs = artist.getSongList();
// 一个歌单中所有歌曲的id,组装成用逗号分割的字符串,形如:347230,347231。记住这个用法,很方便
String sIdsParam = buildManyIdParam(songs);
// 抓取结果
Map songsDetailObj = getSourceDataObj(S_D_API_PREFIX, sIdsParam);
// 原始数据中的 songs 是歌曲列表
List<Map> sourceSongs = (List<Map>) songsDetailObj.get("songs");
// 临时的 Map
Map<String, Map> sourceSongsMap = new HashMap<>();
// 遍历歌曲列表
for (Map songSourceData : sourceSongs) {
String sId = songSourceData.get("id").toString();
// 原始歌曲数据对象放入一个临时的 Map 中
sourceSongsMap.put(sId, songSourceData);
}
// 再次遍历歌单中的歌曲,填入详情数据
for (Song song : songs) {
String sId = song.getId();
// 从临时的Map中取得对应的歌曲源数据,使用id直接获取,比较方便
Map songSourceData = sourceSongsMap.get(sId);
// 源歌曲数据中,ar 字段是歌手列表
List<Map> singersData = (List<Map>) songSourceData.get("ar");
// 歌手集合
List<User> singers = new ArrayList<>();
for (Map singerData : singersData) {
// 歌手对象
User singer = new User();
singer.setId(singerData.get("id").toString());
singer.setNickName(singerData.get("name").toString());
// 歌手集合放入歌手对象
singers.add(singer);
}
// 歌手集合放入歌曲
song.setSingers(singers);
// 专辑
Map albumData = (Map) songSourceData.get("al");
Album album = new Album();
album.setId(albumData.get("id").toString());
album.setName(albumData.get("name").toString());
if (albumData.get("picUrl") != null) {
album.setPicUrl(albumData.get("picUrl").toString());
}
// 专辑对象放入歌曲
song.setAlbum(album);
}
}
/**
* 装配 歌曲评论
*
* @param artistId
*/
private void assembleSongComment(String artistId) {
Artist artist = getArtist(artistId);
List<Song> songs = artist.getSongList();
for (Song song : songs) {
String sIdsParam = song.getId() + "&limit=5";
// 抓取结果
Map songsCommontObj = getSourceDataObj(S_C_API_PREFIX, sIdsParam);
List<Map> hotCommentsData = (List<Map>) songsCommontObj.get("hotComments");
List<Map> commentsData = (List<Map>) songsCommontObj.get("comments");
List<Comment> hotComments = getComments(hotCommentsData);
List<Comment> commments = getComments(commentsData);
song.setComments(commments);
song.setHotComments(hotComments);
}
}
/**
* 装配 歌曲链接
*
* @param artistId
*/
// private void assembleSongUrl(String artistId) {
// Artist artist = getArtist(artistId);
// // 删除其它语句,保留必要的语句
// List<Song> songs = artist.getSongList();
// String sIdsParam = buildManyIdParam(songs);
// // 抓取结果
// Map songsFileObj = getSourceDataObj(S_F_API_PREFIX, sIdsParam);
// List data = (List) songsFileObj.get("data");
// Map map = (Map) data.get(0);
// for (Song song:songs){
// song.setSourceUrl((String) map.get("url"));
// }
//
// }
@SuppressWarnings("unchecked")
private void assembleSongUrl(String artistId) {
Artist artist = getArtist(artistId);
// 取不到歌单说明参数输入错误
if (artist == null) {
return;
}
// 删除其它语句,保留必要的语句
List<Song> songs = artist.getSongList();
String sIdsParam = buildManyIdParam(songs);
// 抓取结果
Map songsFileObj = getSourceDataObj(S_F_API_PREFIX, sIdsParam);
// 原始数据中的 data 是音乐文件列表
List<Map> datas = (List<Map>) songsFileObj.get("data");
// 临时的 Map
Map<String, Map> sourceSongsMap = new HashMap<>();
// 遍历音乐文件列表
for (Map songFileData : datas) {
String sId = songFileData.get("id").toString();
// 原始音乐文件数据对象放入一个临时的 Map 中
sourceSongsMap.put(sId, songFileData);
}
// 再次遍历歌单中的歌曲,填入音乐文件URL
for (Song song : songs) {
String sId = song.getId();
// 从临时的Map中取得对应的音乐文件源数据,使用id直接获取,比较方便
Map songFileData = sourceSongsMap.get(sId);
// 源音乐文件数据中,url 字段就是文件地址
if (songFileData != null && songFileData.get("url") != null) {
String songFileUrl = songFileData.get("url").toString();
song.setSourceUrl(songFileUrl);
}
}
}
private String buildManyIdParam(List<Song> songs) {
// 收集一个歌单中所有歌曲的id,放入一个list
List<String> songIds = new ArrayList<>();
for (Song song : songs) {
songIds.add(song.getId());
}
// 一个歌单中所有歌曲的id,组装成用逗号分割的字符串,形如:347230,347231。记住这个用法,很方便
String sIdsParam = String.join(",", songIds);
return sIdsParam;
}
private List<Comment> getComments(List<Map> rawData) {
List<Comment> comments = new ArrayList<>();
for (Map map : rawData) {
Comment comment = new Comment();
User u = new User();
Map user = (Map) map.get("user");
u.setId( user.get("userId").toString());
u.setNickName(user.get("nickname").toString());
u.setAvatar( user.get("avatarUrl").toString());
comment.setCommentUser(u);
comment.setContent((String) map.get("content"));
comment.setId(map.get("commentId").toString());
comment.setLikedCount( map.get("likedCount").toString());
comment.setTime(map.get("time").toString());
comments.add(comment);
}
return comments;
}
private void generateWordCloud(String artistId) {
Artist artist = getArtist(artistId);
List<Song> songs = artist.getSongList();
List<String> contents = new ArrayList<>();
for (Song song : songs) {
// 遍历歌曲所有的评论,包括普通评论和热门评论,把评论内容字符串存入 contents 集合
contents.add(getCommentContents(song.getHotComments()));
contents.add(getCommentContents(song.getComments()));
}
// 调用方法,制作词云
WordCloudUtil.generate(artistId, contents);
}
private String getCommentContents(List<Comment> comments){
String contents = "";
for (Comment comment:comments){
contents += comment.getContent();
}
System.out.println(contents);
return contents;
}
}
3、设计一个制作词云的工具类util
package com.youkeda.music.util;
import com.kennycason.kumo.CollisionMode;
import com.kennycason.kumo.WordCloud;
import com.kennycason.kumo.WordFrequency;
import com.kennycason.kumo.bg.CircleBackground;
import com.kennycason.kumo.font.FontWeight;
import com.kennycason.kumo.font.KumoFont;
import com.kennycason.kumo.font.scale.SqrtFontScalar;
import com.kennycason.kumo.nlp.FrequencyAnalyzer;
import com.kennycason.kumo.nlp.tokenizers.ChineseWordTokenizer;
import com.kennycason.kumo.palette.ColorPalette;
import java.awt.Color;
import java.awt.Dimension;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.util.List;
/**
 * Utility for rendering a word-cloud image.
 */
public class WordCloudUtil {
    /**
     * Renders a word cloud from the given texts and writes it to the file
     * {@code wordCloud-<artistId>.png}.
     *
     * @param artistId id of the artist; used only to build the output file name
     * @param texts    texts to analyse
     */
    public static void generate(String artistId, List<String> texts) {
        FrequencyAnalyzer frequencyAnalyzer = new FrequencyAnalyzer();
        // Number of words to keep.
        frequencyAnalyzer.setWordFrequenciesToReturn(500);
        // Minimum word length (a length filter, not a minimum frequency).
        frequencyAnalyzer.setMinWordLength(4);
        // Tokenizer for Chinese text.
        frequencyAnalyzer.setWordTokenizer(new ChineseWordTokenizer());
        // Tokenize the texts and count the words.
        final List<WordFrequency> wordFrequencyList = frequencyAnalyzer.load(texts);

        // Image size in pixels.
        Dimension dimension = new Dimension(600, 600);
        WordCloud wordCloud = new WordCloud(dimension, CollisionMode.PIXEL_PERFECT);
        wordCloud.setPadding(2);
        // The font must be able to render Chinese characters.
        // NOTE(review): presumably this font has to be installed on the machine - confirm.
        wordCloud.setKumoFont(new KumoFont("阿里巴巴普惠体 Light", FontWeight.PLAIN));
        // Palette of six colours.
        wordCloud.setColorPalette(
                new ColorPalette(new Color(0x4055F1), new Color(0x408DF1), new Color(0x40AAF1),
                        new Color(0x40C5F1), new Color(0x40D3F1), new Color(0xFFFFFF)));
        wordCloud.setFontScalar(new SqrtFontScalar(10, 70));
        // Circular background layer.
        wordCloud.setBackground(new CircleBackground(300));
        wordCloud.build(wordFrequencyList);
        // The file name depends on the artist id only, so a second run for the
        // same artist writes to the same path.
        wordCloud.writeToFile("wordCloud-" + artistId + ".png");
    }
}
4、最后测试类
package com.youkeda.music.test;
import com.youkeda.music.model.Artist;
import com.youkeda.music.model.Song;
import com.youkeda.music.service.SongCrawlerService;
import com.youkeda.music.service.impl.SongCrawlerServiceImpl;
/**
 * Manual end-to-end check of the crawler service: crawls one known artist and
 * verifies a known song of it. Exits with status 1 on the first failed check
 * and with status 0 when every check passes.
 */
public class SongCrawlerTest {
    private static final String SA_DING_DING = "萨顶顶";
    private static final String A_ID = "9270";
    private static final String ZUO_SHOU_ZHI_YUE = "左手指月";
    private static final String S_ID = "536096151";

    /** Prints {@code message} and terminates the JVM with exit status 1. */
    private static void fail(String message) {
        System.out.println(message);
        System.exit(1);
    }

    public static void main(String[] args) {
        SongCrawlerService songService = new SongCrawlerServiceImpl();
        songService.start(A_ID);

        Artist artist = songService.getArtist(A_ID);
        if (artist == null) {
            fail("歌单抓取失败,没有取得歌单数据。");
            return;
        }
        System.out.println("歌单名称:" + artist.getName());
        if (!SA_DING_DING.equals(artist.getName())) {
            fail("歌单名称错误,不是本测试用例指定的歌单。");
        }

        Song song = songService.getSong(A_ID, S_ID);
        if (song == null) {
            fail("歌曲抓取失败,歌单中没有本测试用例指定的歌曲。");
            return;
        }
        System.out.println("歌曲名称:" + song.getName());
        if (!ZUO_SHOU_ZHI_YUE.equals(song.getName())) {
            fail("歌曲名称错误,不是本测试用例指定的歌曲。");
        }

        // A missing or empty singer list is reported as a singer mismatch.
        if (song.getSingers() == null || song.getSingers().isEmpty()
                || !SA_DING_DING.equals(song.getSingers().get(0).getNickName())) {
            fail("歌手名称错误,不是本测试用例指定的歌曲的歌手。");
            return;
        }
        if (song.getAlbum() == null
                || !"香蜜沉沉烬如霜 电视原声音乐专辑".equals(song.getAlbum().getName())) {
            fail("专辑名称错误,不是本测试用例指定的歌曲的专辑。");
            return;
        }
        if (song.getSourceUrl() == null) {
            fail("歌曲文件地址错误,没有正确抓取音乐文件地址。");
        }
        if (song.getHotComments() == null || song.getHotComments().isEmpty()) {
            fail("歌曲热门评论错误,没有正确抓取评论数据。");
            return;
        }

        System.out.println("歌曲所属专辑名称:" + song.getAlbum().getName());
        System.out.println("歌曲的歌手名称:" + song.getSingers().get(0).getNickName());
        System.out.println("歌曲音乐为文件地址:" + song.getSourceUrl());
        System.out.println("歌曲热门评论:" + song.getHotComments().get(0).getContent());
        System.out.println("歌曲服务运行成功。非常棒!");
        System.exit(0);
    }
}
运行成功就大功告成啦!!啊哈哈哈——出来吧!词云!
三、总结
也不知为什么,会写这一篇文章,(这是我刚做完高数作业的时候,突发奇想就来这里码了)可能就是想记录一下自己的小成功,让自己确信自己在进步,让自己小快乐一下,毕竟自己能做个小项目出来,我真的,哈哈哈蛮激动的。再接再厉!希望看到这里的你也能找到属于自己的学习方法,早日从小白晋升为大佬,啊哈哈哈。
第一次csdn创作,求大佬指正!
求多多鼓励支持一下下,奥利给!