1. 设计思路
因为博客有分页功能,所以想获取全部博客的信息一定要先计算总共有多少页,当前页爬取完后跳转到下一页的链接爬取新的博客信息;
有两种方式来获取页数:
1. 通过爬取分页的数值
但在获取class信息上此种方式辨识度不足,在选中状态下class会进行变化
2. 获取总博客数/每页条数=页数
此种方式获取的标签内容只有一条,辨识度足够,进行正则获取数值即可获得总博客数,但是在每页条数发生系统变化的时候可能获得的分页结果会不准确,目前csdn是每页40条,如果变为20则会出现数据差异。
获取完页数后就需要遍历每页博客的地址来获取不同页的博客信息
目前分页地址只是数值代表变更的地址栏,如此可以直接遍历按页数进行拼接地址获取博客内容
完整代码最后会贴,如下只是获取页数的部分代码:
/**
 * Crawls every listing page of the blog and returns all articles.
 *
 * The total page count is derived from the article counter shown in the
 * page header (total articles / 40 per page, rounded up).
 *
 * @return all articles found across every listing page
 * @throws IOException if the page cannot be fetched or its layout changed
 */
public static List<Article> allArtitcle() throws IOException {
    Connection conn = Jsoup.connect(URL)
            .userAgent("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .timeout(5000)
            .method(Connection.Method.GET);
    Document doc = conn.get();
    Element body = doc.body();
    // The header block holds a span whose text contains the total article count.
    Element articleListDiv = body.getElementById("container-header-blog");
    if (articleListDiv == null) {
        // Fail loudly instead of NPE-ing: the page layout has changed.
        throw new IOException("Cannot find #container-header-blog; page layout may have changed");
    }
    String totalPageStr = articleListDiv.select("span").text();
    // Keep only the digits to isolate the article count.
    String digits = totalPageStr.replaceAll("[^0-9]", "");
    if (digits.isEmpty()) {
        throw new IOException("No article count found in header text: " + totalPageStr);
    }
    int totalArticles = Integer.parseInt(digits);
    // CSDN currently lists 40 articles per page; integer ceiling division.
    int totalPage = (totalArticles + 40 - 1) / 40;
    // Crawl each listing page in turn and merge the results.
    List<Article> articleList = new ArrayList<Article>();
    for (int pageNow = 1; pageNow <= totalPage; pageNow++) {
        articleList.addAll(getArtitcleByPage(pageNow));
    }
    return articleList;
}
页数处理完毕后每页的博客也该处理爬取了,先观察页面的布局特点
可以看到是articleMeList-blog这个id下article-list这个class下面组成的一个个div构成的平级目录
articleMeList-blog -> article-list -> [<div><div/>,<div></div>]
如此就按级获取Element和Elements,而在进行遍历的时候需要获取h4标签下的a标签的内容,代码如下:
/**
 * Crawls a single listing page and extracts the title and link of every
 * article on it.
 *
 * @param pageNow 1-based page number appended to the listing base URL
 * @return the articles found on that page; empty if the page holds none
 *         or the expected layout elements are missing
 * @throws IOException if the HTTP request fails
 */
public static List<Article> getArtitcleByPage(int pageNow) throws IOException {
    // Build the HTTP connection for this listing page.
    Connection conn = Jsoup.connect(URL + pageNow)           // listing page URL: base + page number
            .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:47.0) Gecko/20100101 Firefox/47.0") // browser UA
            .timeout(5000)                                   // connection timeout in ms
            .method(Connection.Method.GET);                  // plain GET request
    // Fetch and parse the HTML document.
    Document doc = conn.get();
    Element body = doc.body();
    List<Article> resultList = new ArrayList<Article>();
    // Expected layout: #articleMeList-blog > .article-list > one div per article.
    Element articleListDiv = body.getElementById("articleMeList-blog");
    if (articleListDiv == null) {
        // Layout changed or the page is empty; return no results instead of NPE-ing.
        return resultList;
    }
    Elements articleList = articleListDiv.getElementsByClass("article-list");
    if (articleList.isEmpty()) {
        return resultList;
    }
    Elements articleItem =
            articleList.get(0).getElementsByClass("article-item-box csdn-tracking-statistics");
    // Wrap each crawled article into an Article entity.
    for (Element article : articleItem) {
        Elements links = article.select("h4 a"); // the a tag under the h4 holds title + href
        if (links.isEmpty()) {
            continue; // item without a title link — skip it
        }
        Element linkNode = links.get(0);
        Article articleEntity = new Article();
        articleEntity.setAddress(linkNode.attr("href")); // absolute article URL
        articleEntity.setTitle(linkNode.text());         // article title text
        resultList.add(articleEntity);
    }
    return resultList;
}
2. 完整代码,copy即可用
package com.jingan.jinganservice.task;
import com.jingan.jinganpublic.util.HttpUtil;
import com.jingan.jinganservice.model.Article;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawls the article list of a CSDN blog with Jsoup and periodically
 * re-requests each article URL via a scheduled task.
 *
 * @author 余生大大
 * @title: ImageCrawling
 * @projectName jinganplatform
 * @description: crawls all blog articles and pings them on a schedule
 * @date 2021/10/22 0022下午 16:37
 */
public class ImageCrawling {
    /** Articles listed per page on CSDN; may change on their side and skew the page math. */
    private static final int ARTICLES_PER_PAGE = 40;
    /** Base URL of the paginated article list; the page number is appended. */
    private static final String URL = "https://blog.csdn.net/AnNanDu/article/list/";
    /** Counter of completed scheduled runs (incremented by test()). */
    public static int TASKSIZE = 1;
    /** All crawled articles, loaded once at class initialization. */
    public static List<Article> staticArticleList;

    // NOTE(review): doing network I/O in a static initializer makes class loading
    // slow and fragile; consider lazy initialization. Kept for compatibility.
    static {
        try {
            staticArticleList = allArtitcle();
        } catch (IOException e) {
            // Fall back to an empty list so the scheduled task never NPEs.
            staticArticleList = new ArrayList<Article>();
            e.printStackTrace();
        }
    }

    /**
     * Scheduled task: re-requests every crawled article URL once a minute.
     */
    @Scheduled(fixedRate = 60 * 1000)
    void test() {
        HttpUtil httpUtil = new HttpUtil();
        for (Article article : staticArticleList) {
            httpUtil.sendGet(article.getAddress(), null);
        }
        System.out.println(TASKSIZE++);
    }

    /**
     * Entry point for a one-off crawl: prints every article's title and URL.
     *
     * @param args unused
     * @throws IOException if any page fetch fails
     */
    public static void main(String[] args) throws IOException {
        List<Article> articleList = allArtitcle();
        // Print every article of the blog author.
        for (Article article : articleList) {
            System.out.println("文章标题:" + article.getTitle());
            System.out.println("文章绝对路劲地址:" + article.getAddress());
        }
        System.out.println(articleList.size());
    }

    /**
     * Crawls every listing page of the blog and returns all articles.
     *
     * The total page count is derived from the article counter in the page
     * header (total articles / ARTICLES_PER_PAGE, rounded up).
     *
     * @return all articles found across every listing page
     * @throws IOException if the page cannot be fetched or its layout changed
     */
    public static List<Article> allArtitcle() throws IOException {
        Connection conn = Jsoup.connect(URL)
                .userAgent("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0")
                .timeout(5000)
                .method(Connection.Method.GET);
        Document doc = conn.get();
        Element body = doc.body();
        // The header block holds a span whose text contains the total article count.
        Element articleListDiv = body.getElementById("container-header-blog");
        if (articleListDiv == null) {
            // Fail loudly instead of NPE-ing: the page layout has changed.
            throw new IOException("Cannot find #container-header-blog; page layout may have changed");
        }
        String totalPageStr = articleListDiv.select("span").text();
        // Keep only the digits to isolate the article count.
        String digits = totalPageStr.replaceAll("[^0-9]", "");
        if (digits.isEmpty()) {
            throw new IOException("No article count found in header text: " + totalPageStr);
        }
        int totalArticles = Integer.parseInt(digits);
        // Integer ceiling division: pages = ceil(articles / perPage).
        int totalPage = (totalArticles + ARTICLES_PER_PAGE - 1) / ARTICLES_PER_PAGE;
        // Crawl each listing page in turn and merge the results.
        List<Article> articleList = new ArrayList<Article>();
        for (int pageNow = 1; pageNow <= totalPage; pageNow++) {
            articleList.addAll(getArtitcleByPage(pageNow));
        }
        return articleList;
    }

    /**
     * Crawls a single listing page and extracts the title and link of every
     * article on it.
     *
     * @param pageNow 1-based page number appended to the listing base URL
     * @return the articles found on that page; empty if the page holds none
     *         or the expected layout elements are missing
     * @throws IOException if the HTTP request fails
     */
    public static List<Article> getArtitcleByPage(int pageNow) throws IOException {
        // Build the HTTP connection for this listing page.
        Connection conn = Jsoup.connect(URL + pageNow)
                .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:47.0) Gecko/20100101 Firefox/47.0")
                .timeout(5000)
                .method(Connection.Method.GET);
        // Fetch and parse the HTML document.
        Document doc = conn.get();
        Element body = doc.body();
        List<Article> resultList = new ArrayList<Article>();
        // Expected layout: #articleMeList-blog > .article-list > one div per article.
        Element articleListDiv = body.getElementById("articleMeList-blog");
        if (articleListDiv == null) {
            // Layout changed or the page is empty; return no results instead of NPE-ing.
            return resultList;
        }
        Elements articleList = articleListDiv.getElementsByClass("article-list");
        if (articleList.isEmpty()) {
            return resultList;
        }
        Elements articleItem =
                articleList.get(0).getElementsByClass("article-item-box csdn-tracking-statistics");
        // Wrap each crawled article into an Article entity.
        for (Element article : articleItem) {
            Elements links = article.select("h4 a"); // the a tag under the h4 holds title + href
            if (links.isEmpty()) {
                continue; // item without a title link — skip it
            }
            Element linkNode = links.get(0);
            Article articleEntity = new Article();
            articleEntity.setAddress(linkNode.attr("href")); // absolute article URL
            articleEntity.setTitle(linkNode.text());         // article title text
            resultList.add(articleEntity);
        }
        return resultList;
    }
}
3. 结果