Multi-Level Crawling
This example is a Maven project built on Spring Boot. Since I persist the results to a database, the configuration below includes the database setup; if you don't need it, swap in your own storage.
See the WebMagic Chinese official site if you want to study the framework in more depth.
Step 1: Import the dependencies
<parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>2.1.3.RELEASE</version>
</parent>
<dependencies>
    <!-- Compile -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter</artifactId>
    </dependency>
    <!-- Test -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.8.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-api</artifactId>
        <version>2.8.2</version>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-actuator</artifactId>
    </dependency>
    <!-- WebMagic core dependency -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>0.7.3</version>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <!-- Lombok annotations -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
    </dependency>
    <!-- MySQL connector -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
    </dependency>
</dependencies>
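With spring-boot-starter-parent as the parent POM, the project also needs a standard Spring Boot entry class. A minimal sketch (the class name CrawlerApplication is my own placeholder; the crawler itself runs from its own plain main method and does not require the Spring context):
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

// Minimal Spring Boot bootstrap class; only needed if you want the full
// Spring context (web endpoints, actuator) alongside the crawler.
@SpringBootApplication
public class CrawlerApplication {
    public static void main(String[] args) {
        SpringApplication.run(CrawlerApplication.class, args);
    }
}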
Step 2: Configure the crawler entry point
The homeUrl and num referenced below are static fields of BlogPageProcessor; the full class appears in Step 7.
public static void main(String[] args) throws Exception {
    long startTime, endTime;
    System.out.println("======== Crawler STARTED ========");
    startTime = new Date().getTime();
    Spider.create(new BlogPageProcessor()).addUrl(homeUrl).thread(5).run();
    endTime = new Date().getTime();
    System.out.println("======== Crawler FINISHED ========");
    System.out.println("Crawled " + num + " records in " + (endTime - startTime) / 1000 + "s");
}
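Spider offers more options than shown here. For example, during debugging you can attach WebMagic's built-in ConsolePipeline. This is an optional sketch, not part of the original flow, and DebugCrawlerDemo is my own placeholder name:
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;

public class DebugCrawlerDemo {
    public static void main(String[] args) {
        // ConsolePipeline prints every field stored via page.putField(...).
        // The processor in Step 7 persists directly inside process(), so
        // this pipeline is purely illustrative here.
        Spider.create(new BlogPageProcessor())
                .addUrl("https://geek-docs.com/")
                .addPipeline(new ConsolePipeline())
                .thread(5)
                .run();
    }
}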
Step 3: Create the entity class
@Data
public class BlogInfo {
    private String url;
    private String title;
    private String content;
    // The following fields are not used in this example
    private String author;
    private String readNum;
    private String recommendNum;
    private String blogHomeUrl;
    private String commentNum;
    private String publishTime;
}
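Lombok's @Data generates the getters, setters, equals/hashCode, and toString at compile time, which is why the class above has no method bodies. For clarity, here is what that expands to by hand for two of the fields (an illustrative excerpt, not code you need to write):
public class BlogInfo {
    private String url;
    private String title;

    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }

    @Override
    public String toString() {
        return "BlogInfo(url=" + url + ", title=" + title + ")";
    }
}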
Step 4: Create the persistence interface
public interface BlogDao {
    /**
     * Save a blog record.
     * @param blog the blog info to persist
     * @return the number of affected rows
     */
    int saveBlog(BlogInfo blog);
}
Step 5: Implement the persistence interface
public class BlogDaoImpl implements BlogDao {
    @Override
    public int saveBlog(BlogInfo blog) {
        DBHelper dbhelper = new DBHelper();
        StringBuffer sql = new StringBuffer();
        sql.append("INSERT INTO hot_weekly_blogs(url,title,author,readNum,")
           .append("recommendNum,blogHomeUrl,commentNum,publishTime,content) ")
           .append("VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)");
        // Set the SQL parameter values
        List<String> sqlValues = new ArrayList<>();
        sqlValues.add(blog.getUrl());
        sqlValues.add(blog.getTitle());
        sqlValues.add(blog.getAuthor());
        sqlValues.add("" + blog.getReadNum());
        sqlValues.add("" + blog.getRecommendNum());
        sqlValues.add(blog.getBlogHomeUrl());
        sqlValues.add("" + blog.getCommentNum());
        sqlValues.add(blog.getPublishTime());
        sqlValues.add(blog.getContent());
        int result = dbhelper.executeUpdate(sql.toString(), sqlValues);
        return result;
    }
}
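A quick standalone usage sketch of the DAO, assuming the table from the final section exists (the URL and values here are placeholders of my own, and SaveBlogDemo is a hypothetical class name):
public class SaveBlogDemo {
    public static void main(String[] args) {
        BlogDao dao = new BlogDaoImpl();
        BlogInfo blog = new BlogInfo();
        blog.setUrl("https://example.com/post/1"); // placeholder values
        blog.setTitle("Sample title");
        blog.setContent("<p>sample body</p>");
        int rows = dao.saveBlog(blog); // returns affected-row count, 1 on success
        System.out.println(rows == 1 ? "saved" : "insert failed");
    }
}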
Step 6: Create the database helper
public class DBHelper {
    public static final String driver_class = "com.mysql.cj.jdbc.Driver";
    public static final String driver_url = "jdbc:mysql://localhost/webmagic?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&allowMultiQueries=true";
    public static final String user = "root";
    public static final String password = "123456";
    private static Connection conn = null;
    private PreparedStatement pst = null;
    private ResultSet rst = null;

    /**
     * Connection
     */
    public DBHelper() {
        try {
            conn = DBHelper.getConnInstance();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Singleton pattern, thread-synchronized.
     *
     * @return the shared Connection
     */
    private static synchronized Connection getConnInstance() {
        if (conn == null) {
            try {
                Class.forName(driver_class);
                conn = DriverManager.getConnection(driver_url, user, password);
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
            System.out.println("Database connection established");
        }
        return conn;
    }

    /**
     * close
     */
    public void close() {
        try {
            if (conn != null) {
                DBHelper.conn.close();
            }
            if (pst != null) {
                this.pst.close();
            }
            if (rst != null) {
                this.rst.close();
            }
            System.out.println("Database resources closed");
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * query
     *
     * @param sql       the SQL statement with ? placeholders
     * @param sqlValues the parameter values, in order
     * @return ResultSet
     */
    public ResultSet executeQuery(String sql, List<String> sqlValues) {
        try {
            pst = conn.prepareStatement(sql);
            if (sqlValues != null && sqlValues.size() > 0) {
                setSqlValues(pst, sqlValues);
            }
            rst = pst.executeQuery();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return rst;
    }

    /**
     * update
     *
     * @param sql       the SQL statement with ? placeholders
     * @param sqlValues the parameter values, in order
     * @return the number of affected rows, or -1 on failure
     */
    public int executeUpdate(String sql, List<String> sqlValues) {
        int result = -1;
        try {
            pst = conn.prepareStatement(sql);
            if (sqlValues != null && sqlValues.size() > 0) {
                setSqlValues(pst, sqlValues);
            }
            result = pst.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Bind the parameter values to the PreparedStatement, 1-indexed.
     *
     * @param pst       the prepared statement
     * @param sqlValues the values to bind
     */
    private void setSqlValues(PreparedStatement pst, List<String> sqlValues) {
        for (int i = 0; i < sqlValues.size(); i++) {
            try {
                pst.setObject(i + 1, sqlValues.get(i));
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}
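A short sanity-check sketch for the helper (the query and DBHelperDemo name are my own examples; note that close() also closes the shared static connection, so any later DBHelper use would need a fresh connection):
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

public class DBHelperDemo {
    public static void main(String[] args) throws SQLException {
        DBHelper db = new DBHelper();
        // Parameterized query: count rows whose title matches a pattern
        List<String> params = new ArrayList<>();
        params.add("%Java%");
        ResultSet rs = db.executeQuery(
                "SELECT COUNT(*) FROM hot_weekly_blogs WHERE title LIKE ?", params);
        if (rs != null && rs.next()) {
            System.out.println("matching rows: " + rs.getInt(1));
        }
        db.close(); // also closes the shared static connection
    }
}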
Step 7: Fetch the target pages, analyze their content, and parse them according to their structure (the most important step)
This relies on XPath syntax; for reference see: https://www.cnblogs.com/wendyw/p/11633588.html
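The processor uses two selector shapes throughout. In WebMagic's Selectable API they look like this (paths copied from the code below; these lines live inside process(Page page)):
// Inside process(Page page): collect all post titles and links under div.item
List<String> titles = page.getHtml().xpath("//div[@class='item']//a/text()").all();
List<String> urls   = page.getHtml().xpath("//div[@class='item']//a/@href").all();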
public class BlogPageProcessor implements PageProcessor {
    // Site configuration: retry count, crawl interval, etc.
    private Site site = Site.me().setRetryTimes(10).setSleepTime(1000);
    // Number of blog posts crawled so far
    private static int num = 0;
    private static String homeUrl = "https://geek-docs.com/";
    // DAO used to persist blog info to the database
    private BlogDao blogDao = new BlogDaoImpl();

    public static void main(String[] args) throws Exception {
        long startTime, endTime;
        System.out.println("======== Crawler STARTED ========");
        startTime = new Date().getTime();
        Spider.create(new BlogPageProcessor()).addUrl(homeUrl).thread(5).run();
        endTime = new Date().getTime();
        System.out.println("======== Crawler FINISHED ========");
        System.out.println("Crawled " + num + " records in " + (endTime - startTime) / 1000 + "s");
    }

    @Override
    public void process(Page page) {
        // Print the raw HTML of the fetched page (debug output)
        String content1 = page.getHtml().get();
        System.out.println(content1);
        try {
            if (homeUrl.equals(page.getUrl().toString())) {
                // First level: extract titles, links, and title attributes in one pass
                List<String> titles = page.getHtml().xpath("//div[@class='item']//a/text()").all();
                List<String> urls = page.getHtml().xpath("//div[@class='item']//a/@href").all();
                List<String> contents = page.getHtml().xpath("//div[@class='item']//a/@title").all();
                for (int i = 0; i < titles.size(); i++) {
                    // Instantiate BlogInfo for persistence
                    BlogInfo blog = new BlogInfo();
                    blog.setUrl(urls.get(i));
                    blog.setTitle(titles.get(i));
                    // Post content; only the raw tagged content is captured here
                    // and can be processed further later
                    blog.setContent(contents.get(i));
                    num++; // increment the record counter
                    System.out.println("num:" + num + " " + blog.toString());
                    blogDao.saveBlog(blog); // persist the crawled record
                }
                // Second level: enqueue section pages, attaching an Item as an extra
                for (int i = 0; i < titles.size(); i++) {
                    Request req = new Request();
                    Item item = new Item(titles.get(i), urls.get(i), null);
                    req.putExtra("item", item);
                    req.setUrl(urls.get(i));
                    page.addTargetRequest(req);
                }
            } else {
                System.out.println("second-level (section) page HTML");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public Site getSite() {
        return this.site;
    }
}
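The else branch above is only a stub. On second-level pages, the Item attached via putExtra can be read back from the request, which is how context flows between crawl levels. A sketch of what that branch could do (the logging is my own illustration; these lines live inside process(Page page)):
// Inside the else branch of process(): recover the Item attached upstream
Item item = (Item) page.getRequest().getExtra("item");
if (item != null) {
    System.out.println("section: " + item.getItemTitle()
            + " -> crawling " + page.getUrl());
    // From here, parse the section page with further XPath selectors as needed
}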
class Item {
    private String itemTitle;          // section title
    private String itemUrl;            // section URL
    private Map<String, String> items; // posts under the section: <title, link>

    public String getItemTitle() {
        return itemTitle;
    }

    public void setItemTitle(String itemTitle) {
        this.itemTitle = itemTitle;
    }

    public String getItemUrl() {
        return itemUrl;
    }

    public void setItemUrl(String itemUrl) {
        this.itemUrl = itemUrl;
    }

    public Map<String, String> getItems() {
        return items;
    }

    public void setItems(Map<String, String> items) {
        this.items = items;
    }

    public Item() {
    }

    public Item(String itemTitle, String itemUrl, Map<String, String> items) {
        this.itemTitle = itemTitle;
        this.itemUrl = itemUrl;
        this.items = items;
    }
}
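Since Lombok is already on the classpath, Item could be collapsed to the same style as BlogInfo. An equivalent sketch, not a required change:
import java.util.Map;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@NoArgsConstructor
@AllArgsConstructor
class Item {
    private String itemTitle;          // section title
    private String itemUrl;            // section URL
    private Map<String, String> items; // posts under the section: <title, link>
}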
Finally: the table creation statement
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for hot_weekly_blogs
-- ----------------------------
DROP TABLE IF EXISTS `hot_weekly_blogs`;
CREATE TABLE `hot_weekly_blogs` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`url` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`title` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`author` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`readNum` varchar(11) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`recommendNum` varchar(11) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`blogHomeUrl` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`commentNum` varchar(11) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '0',
`publishTime` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`content` mediumtext CHARACTER SET utf8 COLLATE utf8_general_ci NULL,
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'Table storing crawled blog data' ROW_FORMAT = Dynamic;
SET FOREIGN_KEY_CHECKS = 1;
One quote per post
There is no morning that won't come, no problem that can't be figured out, and no confusion that can't be outlasted; there are only dreams you dare not chase.