最近由于需要学习ElasticSearch,其中用到了Spring Data ElasticSearch框架。为了准备测试的数据,这里使用了jsoup爬取80s电影网站上的电影信息用于测试,目前可以完成基本信息的爬取。
通过分析可以知道爬虫的入口为:http://80s.la/movie/list/-----p1
那么大概的思路就是:设置一个总页数,循环遍历每一个页面,获取到每一个电影项,再获取电影的详情链接,从中提取常规信息。详情页如下所示:
下面分享下代码:
1、电影信息的实体类,这里的代码是从项目中COPY,所以有些其他的代码:
package com.china.elasticsearch.bean;
import com.china.elasticsearch.constant.MovieConstant;
import org.springframework.data.annotation.Id;
import org.springframework.data.elasticsearch.annotations.Document;
import java.io.Serializable;
import java.util.Date;
/**
 * Movie entity crawled from the 80s movie site and indexed into Elasticsearch.
 */
@Document(indexName = MovieConstant.MOVIE_INDEX_NAME, type = MovieConstant.MOVIE_TYPE_NAME)
public class MovieEntity implements Serializable {

    // BUGFIX: Serializable classes should declare an explicit serialVersionUID;
    // otherwise any recompilation can break deserialization of stored instances.
    private static final long serialVersionUID = 1L;

    /** Movie id (the last path segment of the detail-page URL). */
    @Id
    private String movieId;
    /** Movie title. */
    private String movieName;
    /** Actors, space separated. */
    private String actors;
    /** Genre, e.g. war. */
    private String type;
    /** Region, e.g. mainland China. */
    private String area;
    /** Director. */
    private String director;
    /** Release date; currently not obtainable from the site. */
    private String releaseDate;
    /** Douban score. */
    private double score;
    /** Language. */
    private String language;
    /** Release year. */
    private int year;
    /** Version tip shown on the detail page (e.g. HD/trailer note). */
    private String tip;
    /** Runtime in minutes. */
    private int minute;

    public String getMovieId() {
        return movieId;
    }

    public void setMovieId(String movieId) {
        this.movieId = movieId;
    }

    public String getMovieName() {
        return movieName;
    }

    public void setMovieName(String movieName) {
        this.movieName = movieName;
    }

    public String getActors() {
        return actors;
    }

    public void setActors(String actors) {
        this.actors = actors;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getArea() {
        return area;
    }

    public void setArea(String area) {
        this.area = area;
    }

    public String getDirector() {
        return director;
    }

    public void setDirector(String director) {
        this.director = director;
    }

    public String getReleaseDate() {
        return releaseDate;
    }

    public void setReleaseDate(String releaseDate) {
        this.releaseDate = releaseDate;
    }

    public double getScore() {
        return score;
    }

    public void setScore(double score) {
        this.score = score;
    }

    public String getLanguage() {
        return language;
    }

    public void setLanguage(String language) {
        this.language = language;
    }

    public int getYear() {
        return year;
    }

    public void setYear(int year) {
        this.year = year;
    }

    public String getTip() {
        return tip;
    }

    public void setTip(String tip) {
        this.tip = tip;
    }

    public int getMinute() {
        return minute;
    }

    public void setMinute(int minute) {
        this.minute = minute;
    }
}
2、80s爬虫的基本工具类:
package com.china.elasticsearch.util;
import com.china.elasticsearch.bean.MovieEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.util.StringUtils;
import java.util.ArrayList;
import java.util.List;
/**
 * Basic crawler utility for movie info from the 80s movie site.
 * @date 2019-08-18
 */
public class MovieDownloadUtil {

    /** Site root, used to build absolute detail-page URLs. */
    public static final String ROOT_URL = "http://80s.la";
    /** List-page URL prefix; the 1-based page number is appended (…-p1). */
    public static final String BASIC_URL = ROOT_URL + "/movie/list/-----p";
    /** CSS selector for one movie item on a list page. */
    public static final String CSS_PATH = "#body div#block1.clearfix.noborder ul.me1.clearfix li";
    /** CSS selector for the info panel on a detail page (kept with original typo: it is public API). */
    public static final String DEATAIL_CSS_PATH = "#body div#block1.clearfix div#minfo.clearfix div.info";
    /** Browser-like User-Agent so the site does not reject the request. */
    public static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:61.0) Gecko/20100101 Firefox/61.0";
    /** Number of list pages to crawl; each page holds roughly 25 movies. */
    public static final int TOTAL_PAGE = 20;

    public static void main(String[] args) {
        startGetMovies();
    }

    /**
     * Crawls every list page and collects the movies found on each.
     *
     * @return all movies that could be parsed; never null
     */
    public static List<MovieEntity> startGetMovies() {
        List<MovieEntity> movieList = new ArrayList<MovieEntity>();
        for (int page = 1; page <= TOTAL_PAGE; page++) {
            System.out.println("----------开始爬取第" + page + "页数据");
            // BUGFIX: the try/catch used to wrap the whole loop, so one failing
            // page aborted the entire crawl. Catch per page instead.
            try {
                Document document = Jsoup.connect(BASIC_URL + page).userAgent(USER_AGENT).get();
                Elements movies = document.select(CSS_PATH);
                if (!movies.isEmpty()) {
                    movieList.addAll(getMovieList(movies));
                }
            } catch (Exception e) {
                System.err.println("crawl page " + page + " failed: " + e);
            }
        }
        return movieList;
    }

    /**
     * Visits the detail page of each movie item and builds an entity from it.
     *
     * @param movies the list-page item elements
     * @return the parsed entities
     * @throws Exception if a detail page cannot be fetched or parsed
     */
    private static List<MovieEntity> getMovieList(Elements movies) throws Exception {
        List<MovieEntity> movieList = new ArrayList<MovieEntity>();
        for (Element element : movies) {
            MovieEntity entity = new MovieEntity();
            Element aEle = element.select("h3.h3 a").get(0);
            String movieName = aEle.text();
            String href = aEle.attr("href");
            // The movie id is the last path segment of the detail link.
            String movieId = href.substring(href.lastIndexOf("/") + 1);
            Document detailDocument = Jsoup.connect(ROOT_URL + href).userAgent(USER_AGENT).get();
            Element infoEle = detailDocument.select(DEATAIL_CSS_PATH).get(0);
            String deMovieName = infoEle.selectFirst("h1.font14w").text();
            // The year appears as "title(2019)".
            // BUGFIX: the unguarded substring threw StringIndexOutOfBoundsException
            // for titles without the parenthesized year.
            String year = "0";
            int open = deMovieName.indexOf("(");
            int close = deMovieName.lastIndexOf(")");
            if (open >= 0 && close > open) {
                year = deMovieName.substring(open + 1, close);
            }
            if ("未知".equals(year)) {
                year = "0";
            }
            // Version tip (e.g. HD/trailer note); optional on the page.
            String tip = infoEle.selectFirst(".tip") != null ? infoEle.selectFirst(".tip").text() : "";
            Element firstClearfixDiv = infoEle.select("div.clearfix").get(0);
            try {
                // Fill actors/type/area/language/director/runtime.
                setTypeInfo(firstClearfixDiv, entity);
            } catch (Exception e) {
                // Best effort: log which movie's attribute block failed, keep going.
                System.out.println(movieName);
            }
            // The sibling row carries the Douban score.
            String scoreText = firstClearfixDiv.nextElementSibling().child(0).text();
            if (scoreText != null && scoreText.contains("豆瓣评分")) {
                scoreText = scoreText.replace("豆瓣评分:", "").trim();
            } else {
                if (!scoreText.contains("评论")) {
                    System.out.println(movieName);
                    System.out.println(scoreText);
                }
                scoreText = "0";
            }
            entity.setMovieId(movieId);
            entity.setMovieName(movieName);
            // BUGFIX: malformed numbers no longer abort the whole page.
            entity.setYear(parseIntSafe(year));
            entity.setTip(tip);
            entity.setScore(parseDoubleSafe(scoreText));
            movieList.add(entity);
        }
        return movieList;
    }

    /** Parses an int from scraped text, falling back to 0 on malformed input. */
    private static int parseIntSafe(String text) {
        try {
            return Integer.parseInt(text);
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /** Parses a double from scraped text, falling back to 0 on malformed input. */
    private static double parseDoubleSafe(String text) {
        try {
            return Double.parseDouble(text);
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /**
     * Fills actors, genre, region, language, director and runtime from the
     * labelled spans of the detail page's first attribute row.
     */
    private static void setTypeInfo(Element firstClearfixDiv, MovieEntity entity) {
        Elements spans = firstClearfixDiv.select(">span");
        for (Element span : spans) {
            String flag = span.select(".font_888").get(0).text();
            Elements links = span.select("a");
            if ("演员:".equals(flag)) {
                entity.setActors(getActorAndType(links));
            }
            if ("类型:".equals(flag)) {
                entity.setType(getActorAndType(links));
            }
            if ("地区:".equals(flag)) {
                entity.setArea(getActorAndType(links));
            }
            if ("语言:".equals(flag)) {
                entity.setLanguage(getActorAndType(links));
            }
            if ("导演:".equals(flag)) {
                entity.setDirector(getActorAndType(links));
            }
            if ("片长:".equals(flag)) {
                // Strip the label, the unit and the various country-specific
                // annotations the site mixes into the runtime text.
                String minute = span.text().replace("片长:", "")
                        .replace("分钟", "")
                        .replace(" India: ", "")
                        .replace(" Hong Kong: ", "")
                        .replace(" France: ", "")
                        .replace(" USA: ", "")
                        .replace(" UK: ", "")
                        .replace("min", "")
                        .replace("(台湾)", "")
                        .replace("中国大陆)", "")
                        .replace("(美国/中国大陆)", "")
                        .replace(" Argentina: ", "")
                        .replace(" Japan: ", "").trim();
                entity.setMinute(parseIntSafe(minute));
            }
        }
    }

    /** Joins the text of the given link elements with a single space. */
    private static String getActorAndType(Elements as) {
        List<String> parts = new ArrayList<String>();
        for (Element a : as) {
            parts.add(a.text());
        }
        return StringUtils.collectionToDelimitedString(parts, " ");
    }
}
目前代码只是简单地实现了功能,后续将会进行优化。目前是针对Spring Data ElasticSearch这个学习目标进行的。
具体可以浏览我的Github开源项目,目前刚刚开始做,已经实现了Spring Data ElasticSearch分页查询数据、爬取80s上的电影信息等功能,前端使用Vue编写界面,目前每周进行更新。
Github地址:https://github.com/wuchubuzai2018/elasticsearch-study