springboot+jsoup抓取新闻网站信息

2 篇文章 0 订阅

步骤:

(1)根据url抓取html页面
(2)对html页面进行解析,获取该页面所有的数据,保存到数据库中(mongodb)
(3)遍历所有的数据,更新详情数据

一、导入jar包
 <!--httpclient-->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <!--htmlunit-->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.33</version>
        </dependency>
        <dependency>
            <groupId>net.sf.json-lib</groupId>
            <artifactId>json-lib</artifactId>
            <version>2.4</version>
            <classifier>jdk15</classifier>
        </dependency>
        <dependency>
            <groupId>com.googlecode.juniversalchardet</groupId>
            <artifactId>juniversalchardet</artifactId>
            <version>1.0.3</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-mongodb</artifactId>
        </dependency>
        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>mongo-java-driver</artifactId>
            <version>3.0.4</version>
        </dependency>
二、解析凤凰网新闻
  /**
   * Crawls the ifeng.com military hot-news list page and stores the result in MongoDB.
   *
   * <p>Steps: render {@code url} with HtmlUnit, read the {@code allData} JSON assigned inside
   * the third {@code <script>} element, collect the {@code newsstream} entries plus all
   * following pages returned by {@link HtmlUtils#buildTree}, save them, and finally visit every
   * article URL to fill in the detail fields.
   *
   * <p>An article whose page cannot be fetched or does not have the expected layout is logged
   * and skipped; it no longer aborts the remaining articles.
   *
   * @param url address of the news list page
   * @throws IOException if the list page cannot be loaded
   */
    @GetMapping("/saveNewStream")
    public void saveNewStream(String url) throws IOException {
        log.info("程序开始...");
        long startTime = System.currentTimeMillis();
        // 1. Render the list page (its content is produced by JavaScript).
        Document document = HtmlUtils.getHtmlunit(url);
        // 2. Take the inline script that carries the data and strip the CDATA guards.
        //    The index 2 is tied to the current page layout.
        String html = document.getElementsByTag("script").get(2).html();
        html = html.replace("//<![CDATA[", "").replace("//]]>", "");
        String marker = "allData =";
        // 3. Locate the "var allData = {...};" statement and parse its JSON payload.
        List<NewsStream> newsStreamArrayList = new ArrayList<>();
        for (String variable : html.split("var")) {
            if (!variable.contains(marker)) {
                continue;
            }
            String json = variable.replace(marker, "").trim();
            // Drop the last character of the statement (the original code assumes a trailing ';').
            json = json.substring(0, json.length() - 1);
            JSONObject jsonObject = JSONObject.parseObject(json);
            // 4. First page of the news list.
            JSONArray newsstream = jsonObject.getJSONArray("newsstream");
            List<NewsStream> firstPage = JSONArray.parseArray(newsstream.toString(), NewsStream.class);
            newsStreamArrayList.addAll(firstPage);
            // 5. Remaining pages, fetched through the "load more" endpoint.
            newsStreamArrayList.addAll(HtmlUtils.buildTree(firstPage));
        }
        // 6. Persist the complete list once, after all pages have been collected.
        if (!newsStreamArrayList.isEmpty()) {
            newsStreamService.saveNewsStream(newsStreamArrayList);
        }
        // 7. Fetch the detail page of every entry and update the stored document.
        for (NewsStream entry : newsStreamArrayList) {
            updateArticleDetail(entry);
        }
        long endTime = System.currentTimeMillis();
        log.info("********本程序运行 {} 毫秒完成***********", endTime - startTime);
    }

    /**
     * Downloads one article page, extracts time, source, text and image URLs, and updates the
     * MongoDB document that has the same id. Does nothing (besides logging) when the page cannot
     * be fetched or the article container is missing.
     *
     * @param listEntry entry from the news list; its id and url are read
     */
    private void updateArticleDetail(NewsStream listEntry) {
        String articleUrl = listEntry.getUrl();
        if (articleUrl == null || articleUrl.isEmpty()) {
            log.warn("Skipping news entry {} without article url", listEntry.getId());
            return;
        }
        Document doc;
        try {
            String htmlInfo = Requests.get(articleUrl);
            if (htmlInfo == null) {
                log.warn("Empty response for article {}", articleUrl);
                return;
            }
            doc = Jsoup.parse(htmlInfo);
        } catch (MalformedURLException e) {
            log.error("Invalid article url {}", articleUrl, e);
            return;
        }
        // Article container: holds title, meta information, text and images.
        Elements article = doc.select("div[id=root]").select("div[class=artical-25JfwmD5]");
        if (article.isEmpty()) {
            // Unknown layout: skip so that nothing is written for this article.
            log.warn("Article container not found for {}", articleUrl);
            return;
        }
        NewsStream detail = new NewsStream();
        detail.setId(listEntry.getId());
        // Meta line: the first span is the publish time, the link inside is the source.
        Elements metaSpans = article.select("div[class=info-3Ht6Fk1n clearfix]").select("span");
        Element timeSpan = metaSpans.first();
        if (timeSpan != null) {
            detail.setNewsTime(timeSpan.text());
        }
        detail.setSource(metaSpans.select("a[href]").text());
        // Body: plain text plus the src of every element with a src attribute inside paragraphs.
        Elements body = article.select("div[class=main_content-r5RGqegj]").select("div[class=text-3w2e3DBc]");
        detail.setContent(body.text());
        List<String> imageUrls = new ArrayList<>();
        for (Element withSrc : body.select("p").select("[src]")) {
            imageUrls.add(withSrc.attr("src"));
        }
        detail.setImages(imageUrls);
        // Update the stored document identified by id.
        newsStreamService.updateNewsStream(detail);
    }
jsoup获取动态生成的js内容
@Slf4j
public class HtmlUtils {

    /**
     * Loads {@code url} in a headless HtmlUnit browser with JavaScript enabled and returns the
     * rendered page as a jsoup {@link Document}.
     *
     * <p>The client is closed only after background JavaScript has been given time to finish and
     * the page has been serialized. Any failure while loading or rendering is reported as an
     * {@link IOException} instead of surfacing later as a {@code NullPointerException}.
     *
     * @param url address of the page to render
     * @return the rendered page parsed by jsoup
     * @throws IOException if the page cannot be loaded or rendered
     */
    public static Document getHtmlunit(String url) throws IOException {
        // Browser client emulating Chrome; try-with-resources closes it after rendering.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            // Do not fail on script errors or non-200 status codes.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setActiveXNative(false);
            // The page is never displayed, so CSS is not needed.
            webClient.getOptions().setCssEnabled(false);
            // JavaScript and AJAX support are required to obtain the dynamic content.
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());

            HtmlPage page = webClient.getPage(url);
            // Wait up to 30 seconds for asynchronous JavaScript while the client is still open.
            webClient.waitForBackgroundJavaScript(30000);
            // Serialize the rendered DOM and hand it to jsoup for querying.
            return Jsoup.parse(page.asXml());
        } catch (RuntimeException e) {
            throw new IOException("Failed to load or render page: " + url, e);
        }
    }

    // Base URL of the endpoint that is requested when the list page loads more entries.
    private static final String getViewUrl = "http://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/";
    // JSONP callback name; used both as a path segment and as the callback query parameter.
    private static final String callback = "getColumnInfoCallback";

    /**
     * Requests the next page of the news list ("view more").
     *
     * @param id      id of the last entry of the current page
     * @param nesTime timestamp string of that entry, placed in the request path
     * @param nowTime current time in milliseconds, appended after {@code &_}
     * @return the raw response body returned by {@code Requests.get}
     * @throws MalformedURLException propagated from {@code Requests.get}
     */
    public static String getViewMore(String id,String nesTime,Long nowTime) throws MalformedURLException {
        String viewUrl = getViewUrl + id + "/" + nesTime + "/20/14-35083-/" + callback
                + "?callback=" + callback + "&_" + nowTime;
        return Requests.get(viewUrl);
    }

    /**
     * Collects every following page of the news list.
     *
     * <p>Starting from {@code list}, the last entry of the current page is used to request the
     * next page; this repeats until a page is empty, cannot be fetched or parsed, or makes no
     * progress (its last entry has the same id as the entry used for the request).
     *
     * @param list the page already loaded; its entries are not included in the result
     * @return all entries of the following pages, in request order; empty if there are none
     */
    public static List<NewsStream> buildTree(List<NewsStream> list){
        List<NewsStream> collected = new ArrayList<>();
        List<NewsStream> currentPage = list;
        while (currentPage != null && !currentPage.isEmpty()) {
            // Only the last entry of a page drives the request for the next one.
            NewsStream last = currentPage.get(currentPage.size() - 1);
            List<NewsStream> nextPage = fetchNextPage(last);
            if (nextPage.isEmpty()) {
                break;
            }
            collected.addAll(nextPage);
            NewsStream nextLast = nextPage.get(nextPage.size() - 1);
            if (last.getId() != null && last.getId().equals(nextLast.getId())) {
                // The endpoint returned the same tail again: stop instead of looping forever.
                break;
            }
            currentPage = nextPage;
        }
        return collected;
    }

    /**
     * Requests and parses the page that follows {@code last}.
     *
     * @param last last entry of the current page; its id and news time are read
     * @return the entries of the next page, or an empty list if the request or parsing fails
     */
    private static List<NewsStream> fetchNextPage(NewsStream last) {
        // The endpoint expects the publish time of the last entry as a timestamp string.
        String nesTime = DateUtils.date2TimeStamp(last.getNewsTime(), "yyyy-MM-dd HH:mm:ss");
        String viewMore;
        try {
            viewMore = getViewMore(last.getId(), nesTime, System.currentTimeMillis());
        } catch (MalformedURLException e) {
            log.error("Invalid view-more url for news id {}", last.getId(), e);
            return new ArrayList<>();
        }
        if (viewMore == null) {
            return new ArrayList<>();
        }
        // Unwrap the JSONP envelope: getColumnInfoCallback( ... )
        viewMore = viewMore.replace("getColumnInfoCallback(", "").trim();
        if (viewMore.isEmpty()) {
            return new ArrayList<>();
        }
        viewMore = viewMore.substring(0, viewMore.length() - 1);
        JSONObject view = JSONObject.parseObject(viewMore);
        if (view == null) {
            return new ArrayList<>();
        }
        // The list is located at data.newsstream.
        JSONObject data = JSONObject.parseObject(view.getString("data"));
        if (data == null) {
            return new ArrayList<>();
        }
        List<NewsStream> page = JSONArray.parseArray(data.getString("newsstream"), NewsStream.class);
        return page != null ? page : new ArrayList<>();
    }

service
package com.ddtj.crawl.service;
import com.ddtj.crawl.domain.NewsStream;
import org.springframework.data.mongodb.core.query.Query;
import java.util.List;
/**
 * Storage operations for crawled {@link NewsStream} entries.
 */
public interface NewsStreamService {

    /**
     * Saves the given entries.
     *
     * @param newsStream entries to save
     * @return an implementation-defined status value
     */
    int saveNewsStream(List<NewsStream> newsStream);

    /**
     * Updates the stored entry that has the same id as {@code newsStream}.
     *
     * @param newsStream carries the id and the new field values
     */
    void updateNewsStream(NewsStream newsStream);

    /**
     * Lists stored entries that match the filter.
     *
     * @param newsStream filter values; fields that are {@code null} are not used for filtering
     * @param query      query to which the filter criteria are added
     * @return the matching entries
     */
    List<NewsStream> getNewsStreamList(NewsStream newsStream, Query query);

}
serviceImpl
package com.ddtj.crawl.service.impl;
import com.ddtj.crawl.domain.NewsStream;
import com.ddtj.crawl.service.NewsStreamService;
import com.ddtj.crawl.utils.MongoUtil;
import com.ddtj.crawl.utils.PageHelper;
import com.mongodb.client.result.UpdateResult;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.*;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.Criteria;
import org.springframework.data.mongodb.core.query.Query;
import org.springframework.data.mongodb.core.query.Update;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * {@link NewsStreamService} implementation backed by {@link MongoTemplate}.
 */
@Service
@Slf4j
public class NewsStreamServiceImpl implements NewsStreamService {

    @Autowired
    MongoTemplate mongoTemplate;

    /**
     * Saves every entry with {@code MongoTemplate.save}.
     *
     * <p>{@code save} is used instead of {@code insert}: according to the original author's
     * note, {@code insert} fails when the id already exists while {@code save} inserts a missing
     * document and updates an existing one.
     *
     * @param newsStream entries to save; {@code null} is treated as "nothing to save"
     * @return always {@code 0}; the value is not a count of saved documents
     */
    @Override
    public int saveNewsStream(List<NewsStream> newsStream) {
        if (newsStream == null) {
            return 0;
        }
        log.info("mongodb 数据库插入: {}条数据.........", newsStream.size());
        for (NewsStream entry : newsStream) {
            mongoTemplate.save(entry);
        }
        return 0;
    }

    /**
     * Sets {@code source}, {@code content} and {@code images} on the first stored document whose
     * id equals {@code newsStream.getId()}. Other fields of {@code newsStream} are not written.
     *
     * @param newsStream carries the id and the new field values
     */
    @Override
    public void updateNewsStream(NewsStream newsStream) {
        Query query = new Query(Criteria.where("id").is(newsStream.getId()));
        Update update = new Update();
        update.set("source", newsStream.getSource());
        update.set("content", newsStream.getContent());
        update.set("images", newsStream.getImages());
        mongoTemplate.updateFirst(query, update, NewsStream.class);
    }

    /**
     * Lists news entries, newest {@code newsTime} first.
     *
     * <p>Title, source and url of {@code newsStream} are applied as case-insensitive "contains"
     * filters when they are not {@code null}. The filter text is matched literally: regex
     * metacharacters in it are quoted.
     *
     * @param newsStream filter values
     * @param query      query to extend with the filters and the sort order
     * @return the matching entries
     */
    @Override
    public List<NewsStream> getNewsStreamList(NewsStream newsStream,Query query ) {
        addContainsFilter(query, "title", newsStream.getTitle());
        addContainsFilter(query, "source", newsStream.getSource());
        addContainsFilter(query, "url", newsStream.getUrl());
        // Newest first.
        query.with(Sort.by(Sort.Order.desc("newsTime")));
        return mongoTemplate.find(query, NewsStream.class);
    }

    /** Adds a case-insensitive literal "contains" criterion on {@code field} unless {@code text} is null. */
    private static void addContainsFilter(Query query, String field, String text) {
        if (text == null) {
            return;
        }
        // Pattern.quote keeps characters such as '(', '*' or '?' from being read as regex syntax.
        Pattern pattern = Pattern.compile("^.*" + Pattern.quote(text) + ".*$", Pattern.CASE_INSENSITIVE);
        query.addCriteria(Criteria.where(field).regex(pattern));
    }

}
mapper
/**
 * Repository for {@link NewsStream} documents with {@code String} ids.
 * Declares no methods of its own; everything comes from {@code MongoRepository}.
 */
public interface NewsStreamMapper extends MongoRepository<NewsStream,String> {

}
domain
package com.ddtj.crawl.domain;
import com.ddtj.common.core.annotation.Excel;
import com.ddtj.common.core.web.domain.BaseEntity;
import groovy.transform.builder.Builder;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.springframework.data.mongodb.core.mapping.Document;
import java.util.List;
/**
 * One crawled news entry, mapped to the {@code news_stream} MongoDB collection.
 *
 * <p>NOTE(review): the file imports {@code groovy.transform.builder.Builder}, so {@code @Builder}
 * below is the Groovy annotation rather than Lombok's — confirm which one is intended.
 */
@Document(collection="news_stream")// collection name
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class NewsStream extends BaseEntity{

    // News id
    @Excel(name = "新闻ID")
    private String id;

    // Title
    @Excel(name = "标题")
    private String title;

    // News time, kept as a string
    @Excel(name = "新闻时间")
    private String newsTime;

    // Source
    @Excel(name = "来源")
    private String source;

    // Detail content
    @Excel(name = "详情内容")
    private String content;

    // Original thumbnail
    @Excel(name = "缩略图")
    private String thumbnails;

    // Images of the article detail page
    @Excel(name = "文章详情图片")
    private List<String> images;

    // URL of the article detail page
    @Excel(name = "详情URL")
    private String url;


}
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值