本人原创,转发请注明地址 https://blog.csdn.net/weixin_41442935/article/details/97908344
最近做了一个爬取网站数据的功能,主要爬取新闻类内容。
首先导入依赖。Web 相关的依赖请自行百度,这里使用 jsoup 这个包进行爬取:
<!-- jsoup: fetches and parses the crawled HTML pages.
     Version 1.11.3 predates the fix for CVE-2021-37714 (parser hang on crafted input, fixed in 1.14.2),
     which matters for a crawler that parses pages from the open web.
     NOTE(review): 1.15.x needs Java 8 or newer; confirm the project's Java level before upgrading. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.3</version>
</dependency>
创建实体类
/**
 * Plain data holder for one crawled news article.
 *
 * <p>All fields are nullable and are exposed through conventional getters and setters.
 */
public class JsoupNewsEntity {

    /** Record identifier. */
    private Integer id;
    /** Link of the article. */
    private String url;
    /** Headline of the article. */
    private String title;
    /** Body text of the article. */
    private String text;
    /** Publication time, kept as free-form text. */
    private String time;

    public Integer getId() {
        return this.id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getText() {
        return this.text;
    }

    public void setText(String text) {
        this.text = text;
    }

    public String getTime() {
        return this.time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    /**
     * Renders every field in the form {@code JsoupNewsEntity [id=..., url=..., ...]};
     * unset fields print as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder("JsoupNewsEntity [id=");
        out.append(this.id);
        out.append(", url=").append(this.url);
        out.append(", title=").append(this.title);
        out.append(", text=").append(this.text);
        out.append(", time=").append(this.time);
        out.append("]");
        return out.toString();
    }
}
dao接口类
/**
 * Data-access contract for stored news articles.
 */
public interface NewsDao {

    /**
     * Stores one news record.
     *
     * @param jsoupNewsEntity the article to store
     */
    void InsertNews(JsoupNewsEntity jsoupNewsEntity);

    /**
     * Looks up a news record by its URL.
     *
     * @param url the article link to search for
     * @return the record found for {@code url}
     */
    JsoupNewsEntity SelectNewsUrl(String url);

    /**
     * Queries the links of all news records.
     *
     * @return the list of records
     */
    List<JsoupNewsEntity> SelectNewsUrlAllList();
}
service接口类
/**
 * Service-layer contract for storing and querying news articles.
 */
public interface NewsService {

    /**
     * Stores one news record.
     *
     * @param jsoupNewsEntity the article to store
     */
    void insertNews(JsoupNewsEntity jsoupNewsEntity);

    /**
     * Looks up a news record by its URL.
     *
     * @param url the article link to search for
     * @return the record found for {@code url}
     */
    JsoupNewsEntity SelectNewsUrl(String url);

    /**
     * Queries the links of all news records.
     *
     * @return the list of records
     */
    List<JsoupNewsEntity> SelectNewsUrlAllList();
}
service的实现类
/**
 * {@link NewsService} implementation that forwards every call to {@link NewsDao}.
 */
@Service
public class NewsServiceImpl implements NewsService {

    @Autowired
    private NewsDao newsDao;

    /**
     * Saves one news record by handing it to the DAO.
     *
     * @param jsoupNewsEntity the article to store
     */
    public void insertNews(JsoupNewsEntity jsoupNewsEntity) {
        this.newsDao.InsertNews(jsoupNewsEntity);
    }

    /**
     * Queries the DAO for the record stored under {@code url}.
     *
     * @param url the article link to search for
     * @return whatever the DAO returns for {@code url}
     */
    public JsoupNewsEntity SelectNewsUrl(String url) {
        final JsoupNewsEntity found = this.newsDao.SelectNewsUrl(url);
        return found;
    }

    /**
     * Queries the DAO for the list of all news links.
     *
     * @return the list returned by the DAO
     */
    public List<JsoupNewsEntity> SelectNewsUrlAllList() {
        final List<JsoupNewsEntity> all = this.newsDao.SelectNewsUrlAllList();
        return all;
    }
}
controller类
/**
 * Crawls the news index page at {@link #URL}, follows every sufficiently long http(s) link and
 * stores each article that is not in the database yet.
 */
@Controller
public class JsoupNewsController {
    Logger logger = Logger.getLogger(this.getClass());
    @Autowired
    private NewsService newsService;

    /** Index page whose links are crawled. */
    private static final String URL = "https://news.sina.com.cn/china/";

    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36";

    /**
     * Request timeout in milliseconds. jsoup treats 0 as "wait forever"; because the handler is
     * synchronized, one hanging connection would then block every later crawl request.
     */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Links whose length does not exceed this value are skipped. */
    private static final int MIN_ARTICLE_URL_LENGTH = 30;

    /**
     * Runs one crawl and reports how many new articles were stored.
     *
     * <p>The method is synchronized, so a single controller instance runs at most one crawl at a time.
     * A failure on one link is logged and the crawl continues with the next link.
     *
     * @param req             request that receives the result message in its {@code reg} attribute
     * @param jsoupNewsEntity kept for signature compatibility; a fresh entity is created for every
     *                        article so state cannot leak from one article to the next
     * @return the name of the result view
     */
    @RequestMapping("jsoup")
    public synchronized String JsoupContent(HttpServletRequest req, JsoupNewsEntity jsoupNewsEntity) {
        Document page;
        try {
            page = fetch(URL);
        } catch (IOException e) {
            // Without the index page there is nothing to crawl; report instead of failing with an NPE.
            logger.error("出错>>>>>>>>>>>>>>" + e, e);
            req.setAttribute("reg", "首页抓取失败,未更新数据");
            return "page/jsoupResult";
        }

        int num = 0;
        for (Element link : page.getElementsByTag("a")) {
            String href = link.attr("href");
            logger.debug(href);
            // Jsoup.connect rejects anything that is not an http(s) URL with an unchecked exception,
            // so relative, protocol-relative and javascript: links are skipped up front.
            if (href.length() <= MIN_ARTICLE_URL_LENGTH || !isHttpUrl(href)) {
                continue;
            }
            try {
                if (newsService.SelectNewsUrl(href) != null) {
                    logger.debug("查询到了");
                    continue;
                }
                logger.info("未查询到,要进行保存" + href);
                if (saveArticle(href)) {
                    num++;
                }
            } catch (Exception e) {
                // Boundary catch: a broken page or a failed insert must not abort the whole crawl.
                logger.error("出错>>>>>>>>>>>>>>" + e, e);
            }
        }

        logger.info("爬取完毕");
        req.setAttribute("reg", "更新数据完毕,增加数据" + num + "条");
        return "page/jsoupResult";
    }

    /** Downloads and parses one page with the crawler's fixed request settings. */
    private Document fetch(String url) throws IOException {
        // NOTE(review): the "query" parameter and "auth" cookie are kept from the original request
        // setup; they look like placeholders, confirm whether the target site needs them.
        return Jsoup.connect(url).ignoreContentType(true).data("query", "Java").userAgent(USER_AGENT)
                .cookie("auth", "token").timeout(TIMEOUT_MILLIS).get();
    }

    /** Tells whether {@code href} is an absolute http or https URL. */
    private boolean isHttpUrl(String href) {
        return href.startsWith("http://") || href.startsWith("https://");
    }

    /**
     * Fetches one article page and stores it when it has both a title and a body container.
     *
     * @return {@code true} when a record was inserted, {@code false} when the page was skipped
     * @throws IOException when the article page cannot be downloaded
     */
    private boolean saveArticle(String href) throws IOException {
        // The page is fetched into a local, so a failed download can never reuse the previous article.
        Document article = fetch(href);
        Element content = article.getElementsByAttributeValue("class", "article").first();
        Element title = article.getElementsByAttributeValue("class", "main-title").first();
        if (title == null || content == null) {
            return false;
        }

        Element time = article.getElementsByAttributeValue("class", "date").first();
        String publishedAt;
        if (time != null) {
            publishedAt = time.text();
        } else {
            publishedAt = "获取时间失败";
            logger.info(href + "连接时间获取失败");
        }

        // The author strips these characters because they caused trouble when the stored text was
        // later converted to JSON for the front end.
        String cleanTitle = title.text().replace(",", "").replace(":", "");
        String cleanText = content.select("p").text().replace(",", "").replace(":", "");

        JsoupNewsEntity entity = new JsoupNewsEntity();
        entity.setUrl(href);
        entity.setTitle(cleanTitle);
        entity.setText(cleanText);
        entity.setTime(publishedAt);
        logger.info("数据进行保存中");
        newsService.insertNews(entity);
        logger.info("数据保存完毕");
        return true;
    }
}
SQL 代码(MyBatis 映射文件)
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL statements for the news table. The statement ids are the NewsDao method names. -->
<mapper namespace="webapidemo.cn.wzy.dao.NewsDao.NewsDao">
    <!-- Stores one crawled article. -->
    <insert id="InsertNews"
        parameterType="webapidemo.cn.wzy.Entity.JsoupNews.JsoupNewsEntity">
        insert into news(url, title, text, time)
        values (#{url}, #{title}, #{text}, #{time})
    </insert>

    <!-- Looks up an article by URL; the crawler uses it to decide whether a link is new.
         The DAO passes a plain String, so the parameter type is string.
         "limit 1" keeps the single-object lookup from failing if the same URL was stored twice. -->
    <select id="SelectNewsUrl"
        parameterType="string"
        resultType="webapidemo.cn.wzy.Entity.JsoupNews.JsoupNewsEntity">
        select id, url, title, text, time from news where url = #{url} limit 1
    </select>

    <!-- Lists url and title for the news page. The DAO method takes no argument and returns
         JsoupNewsEntity objects, so no parameterType is declared and rows map to the entity;
         the columns that are not selected stay null. -->
    <select id="SelectNewsUrlAllList"
        resultType="webapidemo.cn.wzy.Entity.JsoupNews.JsoupNewsEntity">
        select url, title from news
        <!-- NOTE(review): 3700 is a hard-coded id cutoff kept from the original; confirm it is still wanted. -->
        <where>id &gt; 3700</where>
        <!-- XML entity reminder: &amp; is the ampersand, &lt; is less-than, &gt; is greater-than,
             &quot; is the double quote, &apos; is the apostrophe. -->
    </select>
</mapper>
查看新闻的请求是动态的,由页面通过 ajax 请求实现。
jsp代码
<%@ page language="java" contentType="text/html; charset=utf-8"
pageEncoding="utf-8"%>
<%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<script type="text/javascript" src="<%=request.getContextPath() %>/jquery/jquery-3.1.1.min.js"></script>
<title>数据获取</title>
</head>
<body>
<script type="text/javascript">
// Load the news list once the DOM is ready.
$(document).ready(function() {
    jsoup();
});

// Requests the stored news links as JSON and renders one link per record into #test_info.
function jsoup() {
    $.ajax({
        "url": "toNewsList.do",
        "type": "get",
        "dataType": "json",
        success: function(data) {
            var box = document.getElementById('test_info');
            var rows = data || [];
            var fragment = document.createDocumentFragment();
            for (var i = 0; i < rows.length; i++) {
                var item = document.createElement('ul');
                item.className = 'info-item';
                var link = document.createElement('a');
                link.target = '_blank';
                // Keeps the opened page from reaching back into this one through window.opener.
                link.rel = 'noopener noreferrer';
                // Titles and URLs come from crawled pages, so they are set through DOM properties
                // instead of being concatenated into HTML, and only http(s) links become clickable.
                if (/^https?:\/\//i.test(rows[i].url)) {
                    link.href = rows[i].url;
                }
                link.textContent = rows[i].title;
                item.appendChild(link);
                fragment.appendChild(item);
            }
            box.innerHTML = '';
            box.appendChild(fragment);
        },
        error: function() {
            // Show a message instead of leaving the page silently empty.
            document.getElementById('test_info').textContent = '新闻列表加载失败';
        }
    });
}
</script>
<div id="test_info" class='test_info'>
</div>
</body>
</html>
哪里不懂可以提问,共同学习探讨
2019-8-5 再补充介绍一下保存数据所用的数据库。另外,controller 里面可能有的地方比较难理解,看不明白的可以问我。
创建数据库语句
-- Creates the webapi database if it is missing and (re)creates the news table used by the crawler.
CREATE DATABASE /*!32312 IF NOT EXISTS*/`webapi` /*!40100 DEFAULT CHARACTER SET utf8 */;
-- Select the database explicitly so the table is not created in whatever schema the session is using.
USE `webapi`;
-- WARNING: this drops all previously crawled rows.
DROP TABLE IF EXISTS `news`;
CREATE TABLE `news` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'id',
`url` varchar(300) NOT NULL COMMENT '文章链接',
`title` varchar(200) NOT NULL COMMENT '标题',
`text` varchar(20000) NOT NULL COMMENT '正文',
`time` varchar(30) NOT NULL COMMENT '发布时间',
`nowtime` datetime NOT NULL DEFAULT NOW() ,
PRIMARY KEY (`id`),
-- The crawler looks up every link by url; without an index each lookup scans the whole table.
-- A 255-character prefix is 765 bytes in utf8, which fits the 767-byte InnoDB index key limit of older MySQL versions.
KEY `idx_news_url` (`url`(255))
) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8;