基于80s电影下载网的电影信息爬虫

最近由于需要使用ElasticSearch进行学习,其中用到了Spring Data ElasticSearch框架,为了准备测试的数据,这里使用了jsoup去爬取80s电影网站上的电影信息用于测试,目前可以实现基本信息的爬取。

通过分析可以知道爬虫的入口为:http://80s.la/movie/list/-----p1

那么大概的思路就是:设置一个总页数,循环遍历每一个页面,获取到每一个电影项,然后获取电影的详情链接,从中提取常规的信息。详情页如下所示:

下面分享下代码:
1、电影信息的实体类,这里的代码是从项目中复制过来的,所以带有一些其他的代码:

package com.china.elasticsearch.bean;

import com.china.elasticsearch.constant.MovieConstant;
import org.springframework.data.annotation.Id;
import org.springframework.data.elasticsearch.annotations.Document;

import java.io.Serializable;
import java.util.Date;

/**
 * Movie information scraped from the 80s movie site, stored as an
 * ElasticSearch document (index/type come from {@code MovieConstant}).
 */
@Document(indexName = MovieConstant.MOVIE_INDEX_NAME, type = MovieConstant.MOVIE_TYPE_NAME)
public class MovieEntity implements Serializable {

    // Explicit serialVersionUID: a Serializable class without one gets a
    // compiler-generated id that changes whenever the class shape changes,
    // breaking deserialization of previously stored instances.
    private static final long serialVersionUID = 1L;

    /** Movie id (last path segment of the detail-page URL). */
    @Id
    private String movieId;

    /** Movie title. */
    private String movieName;

    /** Actor names, space separated. */
    private String actors;

    /** Genre, e.g. "战争" (war). */
    private String type;

    /** Region, e.g. "大陆" (mainland China). */
    private String area;

    /** Director name(s). */
    private String director;

    /** Release date; currently not extractable from the site. */
    private String releaseDate;

    /** Douban rating; 0 when unavailable. */
    private double score;

    /** Language. */
    private String language;

    /** Release year; 0 when unknown. */
    private int year;

    /** Version hint shown on the detail page (e.g. HD/TS). */
    private String tip;

    /** Running time in minutes; 0 when unknown. */
    private int minute;


    public String getMovieId() {
        return movieId;
    }

    public void setMovieId(String movieId) {
        this.movieId = movieId;
    }

    public String getMovieName() {
        return movieName;
    }

    public void setMovieName(String movieName) {
        this.movieName = movieName;
    }

    public String getActors() {
        return actors;
    }

    public void setActors(String actors) {
        this.actors = actors;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getArea() {
        return area;
    }

    public void setArea(String area) {
        this.area = area;
    }

    public String getDirector() {
        return director;
    }

    public void setDirector(String director) {
        this.director = director;
    }

    public String getReleaseDate() {
        return releaseDate;
    }

    public void setReleaseDate(String releaseDate) {
        this.releaseDate = releaseDate;
    }

    public double getScore() {
        return score;
    }

    public void setScore(double score) {
        this.score = score;
    }

    public String getLanguage() {
        return language;
    }

    public void setLanguage(String language) {
        this.language = language;
    }

    public int getYear() {
        return year;
    }

    public void setYear(int year) {
        this.year = year;
    }

    public String getTip() {
        return tip;
    }

    public void setTip(String tip) {
        this.tip = tip;
    }

    public int getMinute() {
        return minute;
    }

    public void setMinute(int minute) {
        this.minute = minute;
    }

    @Override
    public String toString() {
        return "MovieEntity{" +
                "movieId='" + movieId + '\'' +
                ", movieName='" + movieName + '\'' +
                ", actors='" + actors + '\'' +
                ", type='" + type + '\'' +
                ", area='" + area + '\'' +
                ", director='" + director + '\'' +
                ", releaseDate='" + releaseDate + '\'' +
                ", score=" + score +
                ", language='" + language + '\'' +
                ", year=" + year +
                ", tip='" + tip + '\'' +
                ", minute=" + minute +
                '}';
    }
}

2、80s爬虫的基本工具类:

package com.china.elasticsearch.util;

import com.china.elasticsearch.bean.MovieEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.util.StringUtils;

import java.util.ArrayList;
import java.util.List;

/**
 * Basic crawler utility that scrapes movie information from the 80s movie
 * site: it walks the paginated list pages, follows every movie's detail
 * link and extracts the common fields into {@link MovieEntity} instances.
 * @date 2019-08-18
 */
public class MovieDownloadUtil {

    /** Site root, used to build absolute detail-page URLs. */
    public static final String ROOT_URL = "http://80s.la";

    /** List-page URL prefix; the 1-based page number is appended. */
    public static final String BASIC_URL = ROOT_URL + "/movie/list/-----p";

    /** CSS selector for one movie item on a list page. */
    public static final String CSS_PATH = "#body div#block1.clearfix.noborder ul.me1.clearfix li";

    /** CSS selector for the info panel on a detail page. */
    public static final String DEATAIL_CSS_PATH = "#body div#block1.clearfix div#minfo.clearfix div.info";

    /** Desktop user agent so the site serves the regular desktop markup. */
    public static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:61.0) Gecko/20100101 Firefox/61.0";

    /** Number of list pages to crawl (roughly 25 movies per page). */
    public static final int TOTAL_PAGE = 20;

    /** Utility class — not meant to be instantiated. */
    private MovieDownloadUtil() {
    }

    public static void main(String[] args){
        startGetMovies();
    }


    /**
     * Crawls {@link #TOTAL_PAGE} list pages and collects every movie found.
     * A failure on a single page is logged and skipped, so one broken page
     * no longer aborts the whole crawl.
     *
     * @return all movies that could be parsed; never {@code null}
     */
    public static List<MovieEntity> startGetMovies(){
        List<MovieEntity> movieList = new ArrayList<MovieEntity>();
        //遍历每一个页面,每个页面大概25个电影
        for(int i = 0;i < TOTAL_PAGE;i++){
            System.out.println("----------开始爬取第" + (i + 1) + "页数据");
            try{
                Document document = Jsoup.connect(BASIC_URL + (i + 1)).userAgent(USER_AGENT).get();
                // jsoup's select never returns null, so an emptiness check suffices.
                Elements movies = document.select(CSS_PATH);
                if(!movies.isEmpty()) {
                    movieList.addAll(getMovieList(movies));
                }
            }catch(Exception e){
                // Log and continue with the next page instead of aborting.
                e.printStackTrace();
            }
        }
        return movieList;
    }


    /**
     * Follows each movie item's detail link and builds a {@link MovieEntity}
     * per item.
     *
     * @param movies list-page item elements matched by {@link #CSS_PATH}
     * @return one entity per item
     * @throws Exception on network / parse failure for a detail page
     */
    private static List<MovieEntity> getMovieList(Elements movies) throws Exception {
        List<MovieEntity> movieList = new ArrayList<MovieEntity>();
        for(Element element : movies){
            MovieEntity entity = new MovieEntity();

            Element aEle = element.select("h3.h3 a").get(0);
            String movieName = aEle.text();//电影名称
            String href = aEle.attr("href");
            // The last path segment of the detail link is the movie id.
            String movieId = href.substring(href.lastIndexOf('/') + 1);

            Document detailDocument = Jsoup.connect(ROOT_URL + href).userAgent(USER_AGENT).get();
            Element infoEle = detailDocument.select(DEATAIL_CSS_PATH).get(0);

            // Detail title is rendered as "name(year)"; guard against titles
            // without parentheses (the old code threw StringIndexOutOfBounds).
            String deMovieName = infoEle.selectFirst("h1.font14w").text();
            String year = "0";
            int open = deMovieName.indexOf("(");
            int close = deMovieName.lastIndexOf(")");
            if(open >= 0 && close > open){
                year = deMovieName.substring(open + 1, close);
            }
            if("未知".equals(year)){
                year = "0";
            }

            //版本提示,不一定存在
            Element tipEle = infoEle.selectFirst(".tip");
            String tip = tipEle != null ? tipEle.text() : "";

            Element firstClearfixDiv = infoEle.select("div.clearfix").get(0);
            try{
                //设置基本类型的信息
                setTypeInfo(firstClearfixDiv,entity);
            }catch (Exception e){
                System.out.println(movieName);
            }

            //获取评分 — text looks like "豆瓣评分:7.5"; anything else falls back to 0
            String scoreText = firstClearfixDiv.nextElementSibling().child(0).text();
            if(scoreText != null && scoreText.contains("豆瓣评分")){
                scoreText = scoreText.replace("豆瓣评分:","").trim();
            }else{
                if(scoreText != null && !scoreText.contains("评论")){
                    System.out.println(movieName);
                    System.out.println(scoreText);
                }
                scoreText = "0";
            }

            entity.setMovieId(movieId);
            entity.setMovieName(movieName);
            // Safe parses: a malformed value no longer aborts the whole page.
            entity.setYear(parseIntSafe(year, 0));
            entity.setTip(tip);
            entity.setScore(parseDoubleSafe(scoreText, 0));

            movieList.add(entity);
        }
        return movieList;
    }


    /**
     * Reads the labelled spans ("演员:", "类型:", …) of the first info row
     * and copies each value onto the entity.
     */
    private static void setTypeInfo(Element firstClearfixDiv,MovieEntity entity) {
        Elements spans = firstClearfixDiv.select(">span");
        for(Element span : spans){
            String flag = span.select(".font_888").get(0).text();
            Elements actors = span.select("a");

            if("演员:".equals(flag)){
                entity.setActors(getActorAndType(actors));
            }
            if("类型:".equals(flag)){
                entity.setType(getActorAndType(actors));
            }
            if("地区:".equals(flag)){
                entity.setArea(getActorAndType(actors));
            }
            if("语言:".equals(flag)){
                entity.setLanguage(getActorAndType(actors));
            }
            if("导演:".equals(flag)){
                entity.setDirector(getActorAndType(actors));
            }
            if("片长:".equals(flag)){
                // Strip the label, units and the per-country variants the
                // site mixes into the runtime string.
                String minute = span.text().replace("片长:","")
                        .replace("分钟","")
                        .replace(" India: ","")
                        .replace(" Hong Kong: ","")
                        .replace(" France: ","")
                        .replace(" USA: ","")
                        .replace(" UK: ","")
                        .replace("min","")
                        .replace("(台湾)","")
                        .replace("中国大陆)","")
                        .replace("(美国/中国大陆)","")
                        .replace(" Argentina: ","")
                        .replace(" Japan: ","").trim();

                entity.setMinute(parseIntSafe(minute, 0));
            }
        }
    }

    /** Joins the text of each anchor with a single space (e.g. actor names). */
    private static String getActorAndType(Elements as) {
        List<String> texts = new ArrayList<String>();
        for (Element a : as) {
            texts.add(a.text());
        }
        // java.lang.String.join replaces the Spring StringUtils dependency.
        return String.join(" ", texts);
    }

    /** Parses an int, returning {@code fallback} on malformed input. */
    private static int parseIntSafe(String value, int fallback) {
        try {
            return Integer.parseInt(value.trim());
        } catch (RuntimeException e) {
            return fallback;
        }
    }

    /** Parses a double, returning {@code fallback} on malformed input. */
    private static double parseDoubleSafe(String value, double fallback) {
        try {
            return Double.parseDouble(value.trim());
        } catch (RuntimeException e) {
            return fallback;
        }
    }

}

目前代码只是简单的实现了功能,后续将会进行优化。目前是针对Spring Data ElasticSearch这个目标进行的。

具体可以浏览我的Github开源的项目,目前刚刚开始做,已经实现了Spring Data ElasticSearch分页查询数据、爬取80s上的电影信息等功能,前端目前使用Vue界面,每周进行更新。

Github地址:https://github.com/wuchubuzai2018/elasticsearch-study

码云地址:https://gitee.com/wuchubuzai/elasticsearch-study

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值