1. 继续在昨天的工程上学习,因此不需要重新编写 pom 配置文件、MyBatis 配置以及 util 工具类。
首先创建数据库表 v2ex 和对应的实体类 V2ex:
-- One row per V2EX topic captured by the crawler.
-- * IF NOT EXISTS keeps the script re-runnable.
-- * The dump-time AUTO_INCREMENT seed is omitted so a fresh table starts numbering at 1.
-- * utf8mb4 is used instead of the 3-byte `utf8` alias so 4-byte characters (e.g. emoji) can be stored.
CREATE TABLE IF NOT EXISTS `v2ex` (
    `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增主键',        -- surrogate primary key
    `title` varchar(255) DEFAULT NULL COMMENT '标题',               -- topic title
    `url` varchar(255) DEFAULT NULL COMMENT '地址',                 -- topic URL
    `user` varchar(255) DEFAULT NULL COMMENT '发帖的用户',          -- user who posted the topic
    `type` varchar(255) DEFAULT NULL COMMENT '大分类',              -- major category
    `clazz` varchar(255) DEFAULT NULL COMMENT '小分类',             -- minor category
    `up_time` varchar(255) DEFAULT NULL COMMENT '发帖相对时间',     -- relative post time, kept as scraped text
    `reply_num` int(11) DEFAULT NULL COMMENT '回复数量',            -- number of replies
    `crawler_time` datetime DEFAULT NULL COMMENT '抓取时间',        -- when the row was crawled
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
package com.hua.po;
import javax.persistence.Column;
import javax.persistence.Table;
import java.util.Date;
/**
 * Persistence object mapped to the {@code v2ex} table; one instance describes one crawled topic.
 *
 * Created by hua on 2019/3/31.
 */
@Table(name = "v2ex")
public class V2ex {

    /** Auto-increment primary key. */
    private Integer id;

    /** Topic title. */
    private String title;

    /** Topic address. */
    private String url;

    /** User who posted the topic. */
    private String user;

    /** Major category. */
    private String type;

    /** Minor category. */
    private String clazz;

    /** Relative post time, stored as text. */
    @Column(name = "up_time")
    private String upTime;

    /** Time at which the topic was crawled. */
    @Column(name = "crawler_time")
    private Date crawlerTime;

    /** Number of replies. */
    @Column(name = "reply_num")
    private Integer replyNum;

    /** @return the primary key */
    public Integer getId() {
        return this.id;
    }

    /** @param id the primary key */
    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the topic title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the topic title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the topic address */
    public String getUrl() {
        return this.url;
    }

    /** @param url the topic address */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the posting user */
    public String getUser() {
        return this.user;
    }

    /** @param user the posting user */
    public void setUser(String user) {
        this.user = user;
    }

    /** @return the major category */
    public String getType() {
        return this.type;
    }

    /** @param type the major category */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the minor category */
    public String getClazz() {
        return this.clazz;
    }

    /** @param clazz the minor category */
    public void setClazz(String clazz) {
        this.clazz = clazz;
    }

    /** @return the relative post time text */
    public String getUpTime() {
        return this.upTime;
    }

    /** @param upTime the relative post time text */
    public void setUpTime(String upTime) {
        this.upTime = upTime;
    }

    /** @return the crawl time */
    public Date getCrawlerTime() {
        return this.crawlerTime;
    }

    /** @param crawlerTime the crawl time */
    public void setCrawlerTime(Date crawlerTime) {
        this.crawlerTime = crawlerTime;
    }

    /** @return the reply count, possibly {@code null} */
    public Integer getReplyNum() {
        return this.replyNum;
    }

    /** @param replyNum the reply count, may be {@code null} */
    public void setReplyNum(Integer replyNum) {
        this.replyNum = replyNum;
    }
}
2. 创建包含主方法(main)的爬虫类 Day02_V2exCrawler:
package com.hua.main;
import com.hua.mapper.V2exMapper;
import com.hua.po.V2ex;
import com.hua.util.MybatisHelper;
import org.apache.ibatis.session.SqlSession;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.Date;
/**
 * Crawls the tabs listed on the V2EX home page and inserts every topic found on each tab
 * through {@link V2exMapper}.
 *
 * Created by hua on 2019/3/31.
 */
public class Day02_V2exCrawler {

    /** Site root; the tab and topic links read from the pages are appended to it. */
    private static final String BASE_URL = "https://www.v2ex.com";

    /**
     * Entry point: fetches the home page, walks every tab under {@code #Tabs}, and stores
     * the topics of each tab. All inserts are committed together at the end; if anything
     * throws, the session is closed without committing.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        // Database access objects.
        SqlSession sqlSession = MybatisHelper.getSqlSessionLocal();
        try {
            V2exMapper v2exMapper = sqlSession.getMapper(V2exMapper.class);

            Document doc = Jsoup.connect(BASE_URL).get();
            for (Element a : doc.select("#Tabs a")) {
                String tabPath = a.attr("href");
                // These two tabs only aggregate topics from the other tabs, so crawling them
                // would store duplicates. The check must use the raw href: comparing the
                // absolute URL against a relative path never matches.
                if ("/?tab=hot".equals(tabPath) || "/?tab=all".equals(tabPath)) {
                    continue;
                }
                // The tab label is used as the major category.
                String type = a.text();

                Document typeDoc = Jsoup.connect(BASE_URL + tabPath).get();
                for (Element div : typeDoc.select("div.cell.item")) {
                    v2exMapper.insert(parseTopic(div, type));
                }
            }

            // Commit the transaction.
            sqlSession.commit();
        } finally {
            // Always release the connection, even when a request or insert fails.
            sqlSession.close();
        }
    }

    /** Builds a {@link V2ex} from one {@code div.cell.item} element of a tab page. */
    private static V2ex parseTopic(Element div, String type) {
        V2ex v2ex = new V2ex();

        Elements titleLink = div.select("span.item_title a");
        v2ex.setUrl(BASE_URL + titleLink.attr("href"));

        // Remove supplementary-plane code points (U+10000..U+10FFFF), e.g. emoji.
        // NOTE(review): presumably needed because the target column cannot store 4-byte
        // characters - confirm against the table's character set.
        String title = titleLink.text().replaceAll("[\\x{10000}-\\x{10FFFF}]", "");
        System.out.println(title);
        v2ex.setTitle(title);

        v2ex.setType(type);

        // NOTE(review): replaceFirst("[1-9]", "") removes the first non-zero digit found
        // anywhere in the text, not only a leading number - confirm this is intended.
        String[] temp = div.select("span.topic_info").text().trim()
                .replaceFirst("[1-9]", "").trim().split(" ");
        fillTopicInfo(v2ex, temp);

        // Crawl time.
        v2ex.setCrawlerTime(new Date());
        // Reply count.
        v2ex.setReplyNum(parseReplyNum(div.select("a.count_livid").text()));
        return v2ex;
    }

    /**
     * Copies minor category, user name and relative post time out of the space-separated
     * topic-info tokens. A field whose token is missing is left unset.
     */
    private static void fillTopicInfo(V2ex v2ex, String[] temp) {
        // Minor category (split always yields at least one token).
        v2ex.setClazz(temp[0]);
        // User name.
        if (temp.length > 2) {
            v2ex.setUser(temp[2]);
        }
        // Relative post time.
        if (temp.length > 5) {
            if (temp[5].equals("小时")) {
                // Hour form uses four tokens (indices 4..7); skip it when they are incomplete.
                if (temp.length > 7) {
                    v2ex.setUpTime(temp[4] + temp[5] + temp[6] + temp[7]);
                }
            } else {
                v2ex.setUpTime(temp[4] + temp[5]);
            }
        }
    }

    /** Parses the reply counter text; returns {@code null} when it is empty or not a number. */
    private static Integer parseReplyNum(String text) {
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            // No counter element was found for this topic.
            return null;
        }
        try {
            return Integer.valueOf(trimmed);
        } catch (NumberFormatException e) {
            // Best effort: keep the topic and report the unexpected counter text.
            System.err.println("Unparseable reply count: " + text);
            return null;
        }
    }
}