架构:Maven + MyBatis + MySQL + Mapper + Jsoup
先上整体架子
数据库表设计
下面就开始上代码了
Day01_BaiduNewsCrawler
package edu.xawl.main;
import edu.xawl.mapper.BaiduNewsMapper;
import edu.xawl.po.BaiduNews;
import edu.xawl.utils.MybatisHelper;
import org.apache.ibatis.session.SqlSession;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls the Baidu "top" ranking pages with Jsoup and stores every ranked
 * keyword it finds through {@link BaiduNewsMapper}.
 *
 * <p>The entry page is parsed first, then every category linked from its
 * {@code #flist} list is fetched and parsed the same way. All inserts are
 * committed together once every page has been processed.
 */
public class Day01_BaiduNewsCrawler {

    /** Site root that the relative category links are appended to. */
    private static final String BASE_URL = "http://top.baidu.com";

    /**
     * Runs the crawl.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched or a database call fails;
     *                   nothing is committed in that case
     */
    public static void main(String[] args) throws Exception {
        SqlSession sqlsessionLocal = MybatisHelper.getSqlsessionLocal();
        try {
            // Mapper for the table the crawl results are written to.
            BaiduNewsMapper baiduNewsMapper = sqlsessionLocal.getMapper(BaiduNewsMapper.class);

            // Entry page: the real-time hot list.
            String url = "http://top.baidu.com/buzz?b=1";
            Document document = Jsoup.connect(url).get();
            getElmentAndInsert(document, baiduNewsMapper, "实时热点");

            // Category list taken from the entry page.
            Elements lis = document.select("#flist li");
            // The first two entries are skipped, as in the original loop
            // (presumably a heading and the list already crawled above — TODO confirm).
            for (int i = 2; i < lis.size(); i++) {
                Elements link = lis.get(i).select("a");
                String title = link.attr("title");
                String path = link.attr("href");
                if (path.isEmpty()) {
                    // No usable link in this entry; substring(1) below would throw.
                    continue;
                }
                // Drop the first character of the relative link before appending it to the site root.
                String href = BASE_URL + path.substring(1);
                Document categoryPage = Jsoup.connect(href).get();
                getElmentAndInsert(categoryPage, baiduNewsMapper, title);
            }

            // Commit every insert of this run in one transaction.
            sqlsessionLocal.commit();
        } finally {
            // Always release the session, even when a fetch or insert failed.
            sqlsessionLocal.close();
        }
    }

    /**
     * Extracts the ranking rows from one page and inserts each of them.
     *
     * <p>Rows whose last cell is not a plain integer are skipped.
     *
     * @param document        parsed ranking page
     * @param baiduNewsMapper mapper used to insert the rows
     * @param type            category name stored with every row of this page
     */
    public static void getElmentAndInsert(Document document, BaiduNewsMapper baiduNewsMapper, String type) {
        // Rows of the ranking table.
        Elements trs = document.select("#main > div.mainBody > div > table tr");
        for (Element tr : trs) {
            String keyword = tr.select(".list-title").text();
            String clazz = tr.select(".tc").text();
            String tempNum = tr.select(".last").text();

            int num;
            try {
                num = Integer.parseInt(tempNum);
            } catch (NumberFormatException ignored) {
                // Not a data row (no numeric search count) — skip it on purpose.
                continue;
            }

            // Copy the scraped values into the persistent object and store it.
            BaiduNews baiduNews = new BaiduNews();
            baiduNews.setKeyword(keyword);
            baiduNews.setType(type);
            baiduNews.setClazz(clazz);
            baiduNews.setSerchNm(num);
            baiduNewsMapper.insert(baiduNews);
        }
    }
}
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>edu.xawl</groupId>
    <artifactId>crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!-- The sources contain non-ASCII string literals that are written to the
             database, so the build must not depend on the platform encoding. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- Pin the language level instead of relying on the compiler plugin default. -->
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>
    <dependencies>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.20</version>
        </dependency>
        <!-- MyBatis core -->
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>3.4.6</version>
        </dependency>
        <!-- tk.mybatis generic Mapper (single-table CRUD) -->
        <dependency>
            <groupId>tk.mybatis</groupId>
            <artifactId>mapper</artifactId>
            <version>4.0.4</version>
        </dependency>
        <!-- HTML fetching and parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <!-- Generates getters/setters for the entity classes; only needed at compile time -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.2</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
</project>
BaiduNewsMapper
通用 Mapper(tk.mybatis)已经实现了单表的增删改查,我们的接口直接继承 Mapper<T> 就可以使用
package edu.xawl.mapper;
import edu.xawl.po.BaiduNews;
import tk.mybatis.mapper.common.Mapper;
/**
 * MyBatis mapper for {@link BaiduNews}.
 *
 * <p>Declares no methods of its own; every operation available on it is
 * inherited from the tk.mybatis generic {@link Mapper} interface.
 */
public interface BaiduNewsMapper extends Mapper<BaiduNews> {
}
BaiduNews po对象
package edu.xawl.po;
import lombok.Data;
import javax.persistence.Column;
import javax.persistence.Table;
@Data
@Table(name = "baidu_news")
public class BaiduNews {
private Integer id;
private String keyword;
private String type;
private String clazz;
@Column(name = "search_num")
private Integer serchNm;
}
MybatisHelper
package edu.xawl.utils;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import tk.mybatis.mapper.common.Mapper;
import tk.mybatis.mapper.common.MySqlMapper;
import tk.mybatis.mapper.entity.Config;
import tk.mybatis.mapper.mapperhelper.MapperHelper;
/**
 * Builds the MyBatis {@link SqlSessionFactory} once, from
 * {@code mybatis-config.xml} (environment {@code local}), registers the
 * tk.mybatis generic mappers on its configuration, and hands out sessions.
 */
public class MybatisHelper {

    /** Factory shared by all callers; assigned exactly once in the static initializer. */
    private static final SqlSessionFactory sqlSessionFactory;

    static {
        try {
            SqlSessionFactory factory = new SqlSessionFactoryBuilder()
                    .build(Resources.getResourceAsReader("mybatis-config.xml"), "local");

            // Register the generic mapper base interfaces and let the helper
            // process the statements of the already-loaded configuration.
            MapperHelper mapperHelper = new MapperHelper();
            Config config = new Config();
            config.setEnableMethodAnnotation(true);
            mapperHelper.setConfig(config);
            mapperHelper.registerMapper(Mapper.class);
            mapperHelper.registerMapper(MySqlMapper.class);
            // The configuration is available from the factory directly; no
            // session has to be opened just to reach it.
            mapperHelper.processConfiguration(factory.getConfiguration());

            sqlSessionFactory = factory;
        } catch (Exception e) {
            // Fail fast with the real cause instead of leaving the factory
            // null and surfacing an unrelated NullPointerException later.
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * Opens a new session on the {@code local} environment.
     *
     * @return a new {@link SqlSession}; the caller is responsible for
     *         committing its work and closing it
     */
    public static SqlSession getSqlsessionLocal() {
        return sqlSessionFactory.openSession();
    }
}
mybatis-config.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>
    <settings>
        <!-- Underscore-to-camel-case auto mapping is left at its default (off);
             column names that differ from field names are mapped explicitly
             on the entity classes. -->
        <!-- Statement timeout in seconds. -->
        <setting name="defaultStatementTimeout" value="60"/>
    </settings>
    <!-- Short aliases for the entity classes of this project. -->
    <typeAliases>
        <package name="edu.xawl.po"/>
    </typeAliases>
    <!-- Environments: database connection settings. -->
    <environments default="local">
        <environment id="local">
            <transactionManager type="JDBC"/>
            <dataSource type="POOLED">
                <property name="driver" value="com.mysql.jdbc.Driver"/>
                <!-- Encoding is set explicitly because the crawler stores Chinese text. -->
                <property name="url" value="jdbc:mysql://127.0.0.1:3306/crawler?useUnicode=true&amp;characterEncoding=UTF-8"/>
                <!-- NOTE(review): development credentials; do not reuse outside a local setup. -->
                <property name="username" value="root"/>
                <property name="password" value="123"/>
            </dataSource>
        </environment>
    </environments>
    <!-- Mapper interfaces are discovered by package scan. -->
    <mappers>
        <package name="edu.xawl.mapper"/>
    </mappers>
</configuration>