springboot+jsoup抓取新闻网站信息
步骤:
(1)根据url抓取html页面
(2)对html页面进行解析,获取该页面所有的数据,保存到数据库中(mongodb)
(3)遍历所有的数据,更新详情数据
一、导入jar包
<!--httpclient-->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
<!--htmlunit: headless browser used to render the JS-driven news page-->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.33</version>
</dependency>
<!-- NOTE(review): the crawler code calls JSONObject.parseObject / JSONArray.parseArray,
     which are com.alibaba.fastjson APIs — a fastjson dependency appears to be missing
     here, and json-lib below may be unused. Verify against the actual imports. -->
<dependency>
<groupId>net.sf.json-lib</groupId>
<artifactId>json-lib</artifactId>
<version>2.4</version>
<classifier>jdk15</classifier>
</dependency>
<!--charset detection-->
<dependency>
<groupId>com.googlecode.juniversalchardet</groupId>
<artifactId>juniversalchardet</artifactId>
<version>1.0.3</version>
</dependency>
<!--jsoup: HTML parsing / CSS-selector extraction-->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-mongodb</artifactId>
</dependency>
<!-- NOTE(review): pinning mongo-java-driver 3.0.4 may conflict with the driver version
     managed by spring-boot-starter-data-mongodb — confirm the Boot-managed version. -->
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.0.4</version>
</dependency>
二、解析凤凰网新闻
/**
 * 爬虫凤凰网军事热点新闻列表.
 * Crawls the ifeng military news list: renders the JS page, extracts the embedded
 * "allData" JSON, saves the list to MongoDB, then enriches each entry with its
 * article detail page (time, source, content, images).
 *
 * @param url list page to crawl
 * @throws IOException if the initial page cannot be loaded
 */
@GetMapping("/saveNewStream")
public void saveNewStream(String url) throws IOException {
    log.info("程序开始...");
    long startTime = System.currentTimeMillis();
    // 1. Render the JS-driven page with HtmlUnit and get a Jsoup document.
    Document document = HtmlUtils.getHtmlunit(url);
    // 2. The news data lives in a JS variable inside the third <script> tag.
    String html = document.getElementsByTag("script").get(2).html();
    html = html.replace("//<![CDATA[", "");
    html = html.replace("//]]>", "");
    String[] data = html.split("var");
    String sp = "allData =";
    // 3. Extract the JSON payload assigned to "allData".
    List<NewsStream> newsStreamArrayList = new ArrayList<>();
    for (String variable : data) {
        if (variable.contains(sp)) {
            variable = variable.replace(sp, "").trim();
            // Drop the trailing ';' left over from the JS assignment.
            variable = variable.substring(0, variable.length() - 1);
            JSONObject jsonObject = JSONObject.parseObject(variable);
            // 4. "newsstream" is the visible news list.
            JSONArray newsstream = jsonObject.getJSONArray("newsstream");
            List<NewsStream> newsStreams = JSONArray.parseArray(newsstream.toString(), NewsStream.class);
            newsStreamArrayList.addAll(newsStreams);
            // 5. Recursively page through the "view more" endpoint.
            newsStreamArrayList.addAll(HtmlUtils.buildTree(newsStreams));
            // 6. Persist everything collected so far.
            newsStreamService.saveNewsStream(newsStreamArrayList);
        }
    }
    // Enrich every saved entry with data from its detail page.
    newsStreamArrayList.forEach(n -> {
        NewsStream newsStream = new NewsStream();
        String articleUrl = n.getUrl();
        newsStream.setId(n.getId());
        Document doc;
        try {
            String htmlInfo = Requests.get(articleUrl);
            doc = Jsoup.parse(htmlInfo);
        } catch (MalformedURLException e) {
            // BUGFIX: previously fell through with doc == null and NPE'd on doc.select(...);
            // skip unreachable articles instead.
            log.warn("跳过无法访问的详情页: " + articleUrl, e);
            return;
        }
        // Article container (the duplicated identical .select call was redundant — Jsoup's
        // select can match the element itself, so one call yields the same elements).
        Elements elements = doc.select("div[id=root]").select("div[class=artical-25JfwmD5]");
        // Source + publish time spans.
        Elements span = elements.select("div[class=info-3Ht6Fk1n clearfix]").select("span");
        // BUGFIX: guard against pages whose layout does not match the selectors.
        Element firstSpan = span.first();
        if (firstSpan != null) {
            newsStream.setNewsTime(firstSpan.text());
        }
        newsStream.setSource(span.select("a[href]").text());
        // Article body: text content plus inline images.
        Elements contentImg = elements.select("div[class=main_content-r5RGqegj]").select("div[class=text-3w2e3DBc]");
        newsStream.setContent(contentImg.text());
        // Collect every image src inside the body paragraphs (was a raw List with manual index).
        List<String> images = new ArrayList<>();
        for (Element element : contentImg.select("p").select("[src]")) {
            images.add(element.attr("src"));
        }
        newsStream.setImages(images);
        // Update the MongoDB document by id with the detail fields.
        newsStreamService.updateNewsStream(newsStream);
    });
    long endTime = System.currentTimeMillis();
    log.info("********本程序运行 " + (endTime - startTime) + " 毫秒完成***********");
}
jsoup获取动态生成的js内容
@Slf4j
public class HtmlUtils {
/**
 * jsoup获取动态生成的js内容 — renders a JS-heavy page with a headless HtmlUnit
 * browser, waits for background JS/AJAX, and returns the result as a Jsoup Document.
 *
 * @param url page to load
 * @return parsed document of the fully-rendered page
 * @throws IOException if the page cannot be loaded or rendered
 */
public static Document getHtmlunit(String url) throws IOException {
    // 新建一个模拟谷歌Chrome浏览器的浏览器客户端对象
    final WebClient webClient = new WebClient(BrowserVersion.CHROME);
    // 当JS执行出错的时候是否抛出异常, 这里选择不需要
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    // 当HTTP的状态非200时是否抛出异常, 这里选择不需要
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setActiveXNative(false);
    // CSS is irrelevant for scraping — disable for speed.
    webClient.getOptions().setCssEnabled(false);
    // 很重要: the news list is built by JS, so it must be enabled.
    webClient.getOptions().setJavaScriptEnabled(true);
    // 很重要: resynchronize AJAX calls so their results are present in the DOM.
    webClient.setAjaxController(new NicelyResynchronizingAjaxController());
    String pageXml;
    try {
        HtmlPage page = webClient.getPage(url);
        // BUGFIX: wait for async JS BEFORE closing the client — the original closed
        // the WebClient in finally and only then called waitForBackgroundJavaScript,
        // and would NPE on page.asXml() when getPage had failed.
        webClient.waitForBackgroundJavaScript(30000);
        if (page == null) {
            throw new IOException("无法加载页面: " + url);
        }
        pageXml = page.asXml();
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        // Surface load/render failures to the caller instead of swallowing them.
        throw new IOException("页面渲染失败: " + url, e);
    } finally {
        webClient.close();
    }
    // 常规的爬虫操作: parse the rendered XML into a Jsoup document.
    return Jsoup.parse(pageXml);
}
// 下拉加载请求的url — base endpoint of the "load more" JSONP call
private static String getViewUrl = "http://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/";
// 请求参数 — callback name the endpoint wraps its JSON response in
private static String callback = "getColumnInfoCallback";
/**
 * 页面拉取查看更多 — builds the "view more" URL for the given column and
 * timestamp and returns the raw JSONP response body.
 *
 * @param id      column/news id to page from
 * @param nesTime publish-time timestamp of the last visible entry
 * @param nowTime current time in millis (cache-buster "_" parameter)
 * @return raw response text (JSONP-wrapped JSON)
 * @throws MalformedURLException if the assembled URL is invalid
 */
public static String getViewMore(String id, String nesTime, Long nowTime) throws MalformedURLException {
    String viewUrl = getViewUrl + id + "/" + nesTime + "/20/14-35083-/" + callback
            + "?callback=" + callback + "&_" + nowTime;
    return Requests.get(viewUrl);
}
/**
 * 新闻循环取每一页新闻列表 — recursively follows the "view more" endpoint from the
 * last (oldest) entry of the given page and accumulates all subsequent pages.
 *
 * @param list current page of news entries (recursion stops on empty/failed pages)
 * @return all entries from the following pages (empty when there are none)
 */
public static List<NewsStream> buildTree(List<NewsStream> list) {
    List<NewsStream> newsStreamList = new ArrayList<>();
    // BUGFIX: empty-list guard; the original would simply do nothing but also
    // equals-scanned the whole list just to find the last element, which
    // misfires (double-fetches) when the list contains duplicate entries.
    if (list == null || list.isEmpty()) {
        return newsStreamList;
    }
    NewsStream last = list.get(list.size() - 1);
    // 拼接请求url: needs the publish time of the last entry as a timestamp.
    String nesTime = DateUtils.date2TimeStamp(last.getNewsTime(), "yyyy-MM-dd HH:mm:ss");
    String viewMore = null;
    try {
        viewMore = HtmlUtils.getViewMore(last.getId(), nesTime, new Date().getTime());
        // Strip the JSONP wrapper: "getColumnInfoCallback(" ... ")".
        viewMore = viewMore.replace("getColumnInfoCallback(", "").trim();
        viewMore = viewMore.substring(0, viewMore.length() - 1);
    } catch (MalformedURLException e) {
        e.printStackTrace();
    }
    // BUGFIX: bail out instead of NPE-ing on view.getString(...) when the request failed.
    if (viewMore == null) {
        return newsStreamList;
    }
    // Unwrap: response -> "data" -> "newsstream" list.
    JSONObject view = JSONObject.parseObject(viewMore);
    String data = view.getString("data");
    JSONObject object = JSONObject.parseObject(data);
    String newsstreamList = object.getString("newsstream");
    JSONArray jsonArrayNewsstreamList = JSONArray.parseArray(newsstreamList);
    List<NewsStream> newsStreams = JSONArray.parseArray(jsonArrayNewsstreamList.toString(), NewsStream.class);
    newsStreamList.addAll(newsStreams);
    // 递归寻找下一list — recurse from the newly fetched page.
    newsStreamList.addAll(buildTree(newsStreams));
    return newsStreamList;
}
service
package com.ddtj.crawl.service;
import com.ddtj.crawl.domain.NewsStream;
import org.springframework.data.mongodb.core.query.Query;
import java.util.List;
/**
 * Service contract for crawling and querying ifeng news entries stored in MongoDB.
 */
public interface NewsStreamService {
    /**
     * 保存数据 — persist the given news entries (insert-or-update).
     *
     * @param newsStream entries to save
     * @return number of entries processed
     */
    public int saveNewsStream(List<NewsStream> newsStream);

    /**
     * 修改数据 — update the detail fields (source/content/images) of the entry by id.
     *
     * @param newsStream entry carrying the id and the new detail fields
     */
    public void updateNewsStream(NewsStream newsStream);

    /**
     * 查看新闻列表 — fuzzy query on title/source/url, newest first.
     * BUGFIX: declared here so NewsStreamServiceImpl#getNewsStreamList's
     * {@code @Override} compiles; it was implemented but missing from the interface.
     *
     * @param newsStream filter fields (title/source/url may be null)
     * @param query      base Mongo query to extend
     * @return matching entries sorted by newsTime descending
     */
    public List<NewsStream> getNewsStreamList(NewsStream newsStream, Query query);
}
serviceImpl
package com.ddtj.crawl.service.impl;
import com.ddtj.crawl.domain.NewsStream;
import com.ddtj.crawl.service.NewsStreamService;
import com.ddtj.crawl.utils.MongoUtil;
import com.ddtj.crawl.utils.PageHelper;
import com.mongodb.client.result.UpdateResult;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.*;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.Criteria;
import org.springframework.data.mongodb.core.query.Query;
import org.springframework.data.mongodb.core.query.Update;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
@Service
@Slf4j
public class NewsStreamServiceImpl implements NewsStreamService {

    @Autowired
    MongoTemplate mongoTemplate;

    /**
     * 保存数据 — persists each entry with save() (insert-or-update by "_id").
     *
     * @param newsStream entries to persist
     * @return number of entries handed to MongoDB
     */
    @Override
    public int saveNewsStream(List<NewsStream> newsStream) {
        log.info("mongodb 数据库插入: " + newsStream.size() + "条数据.........");
        // save() upserts by "_id" (insert would throw on a duplicate id), so
        // re-running the crawl refreshes existing documents instead of failing.
        newsStream.forEach(mongoTemplate::save);
        // BUGFIX: previously always returned 0 — return the processed count.
        return newsStream.size();
    }

    /**
     * 修改数据 — writes source/content/images into the document matching the id.
     *
     * @param newsStream carries the id plus the detail fields to write
     */
    @Override
    public void updateNewsStream(NewsStream newsStream) {
        Query query = new Query(Criteria.where("id").is(newsStream.getId()));
        Update update = new Update()
                .set("source", newsStream.getSource())
                .set("content", newsStream.getContent())
                .set("images", newsStream.getImages());
        // Result intentionally ignored (the original bound it to an unused local).
        mongoTemplate.updateFirst(query, update, NewsStream.class);
    }

    /**
     * 查看新闻列表 — fuzzy-matches title/source/url and sorts by newsTime descending.
     *
     * @param newsStream filter fields; null fields are ignored
     * @param query      base query extended with the filters
     * @return matching entries, newest first
     */
    @Override
    public List<NewsStream> getNewsStreamList(NewsStream newsStream, Query query) {
        // BUGFIX: user-supplied text was concatenated into the regex un-escaped,
        // so metacharacters could break or abuse the query — quote it literally.
        // 新闻标题模糊
        if (null != newsStream.getTitle()) {
            Pattern patternTitle = Pattern.compile(
                    ".*" + Pattern.quote(newsStream.getTitle()) + ".*", Pattern.CASE_INSENSITIVE);
            query.addCriteria(Criteria.where("title").regex(patternTitle));
        }
        // 新闻来源模糊
        if (null != newsStream.getSource()) {
            Pattern patternSource = Pattern.compile(
                    ".*" + Pattern.quote(newsStream.getSource()) + ".*", Pattern.CASE_INSENSITIVE);
            query.addCriteria(Criteria.where("source").regex(patternSource));
        }
        // 新闻url模糊
        if (null != newsStream.getUrl()) {
            Pattern patternUrl = Pattern.compile(
                    ".*" + Pattern.quote(newsStream.getUrl()) + ".*", Pattern.CASE_INSENSITIVE);
            query.addCriteria(Criteria.where("url").regex(patternUrl));
        }
        // 时间倒序 — newest news first.
        query.with(Sort.by(Sort.Order.desc("newsTime")));
        return mongoTemplate.find(query, NewsStream.class);
    }
}
mapper
// Spring Data MongoDB repository for NewsStream documents (String id);
// all CRUD methods are inherited from MongoRepository — no custom queries needed.
public interface NewsStreamMapper extends MongoRepository<NewsStream,String> {
}
domain
package com.ddtj.crawl.domain;
import com.ddtj.common.core.annotation.Excel;
import com.ddtj.common.core.web.domain.BaseEntity;
import groovy.transform.builder.Builder;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.springframework.data.mongodb.core.mapping.Document;
import java.util.List;
@Document(collection="news_stream")//集合名
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class NewsStream extends BaseEntity{
//id
@Excel(name = "新闻ID")
private String id;
//标题
@Excel(name = "标题")
private String title;
//新闻时间
@Excel(name = "新闻时间")
private String newsTime;
//来源
@Excel(name = "来源")
private String source;
//详情内容
@Excel(name = "详情内容")
private String content;
//原始缩略图
@Excel(name = "缩略图")
private String thumbnails;
//文章详情图片
@Excel(name = "文章详情图片")
private List<String> images;
//文章详情url
@Excel(name = "详情URL")
private String url;
}