1.根据CSDN文章类型获取对应类型的文章链接
public void searchCsdnUrl() throws IOException {
// String[] Arr = {"ai","cloud", "db","career","game", "engineering","web",
// "mobile", "iot","ops","fund", "lang", "arch", "avi", "sec","other"};
List<String> urlList=new ArrayList<>();
urlList.add("web");
urlList.add("ai");
urlList.add("cloud");
urlList.add("db");
urlList.add("fund");
urlList.add("career");
urlList.add("game");
urlList.add("engineering");
urlList.add("mobile");urlList.add("sec");
urlList.add("iot");urlList.add("lang");urlList.add("arch");
urlList.add("ops");urlList.add("avi");urlList.add("other");
for(String type:urlList){
String url="https://www.csdn.net/nav/"+type;
//获取url地址的http链接Connection
Connection conn = Jsoup.connect(url) //博客首页的url地址
.userAgent("Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10") //http请求的浏览器设置
.timeout(1000) //http连接时长
.method(Connection.Method.GET); //请求类型是get请求,http请求还是post,delete等方式
//获取页面的html文档
Document doc = conn.get();
Element body = doc.body();
//将爬取出来的文章封装到Article中,并放到ArrayList里面去
List<Article> resultList = new ArrayList<Article>(100);
Element articleListDiv = body.getElementById("feedlist_id");
Elements articleList = articleListDiv.getElementsByClass("clearfix");
for(Element article : articleList){
Article articleEntity = new Article();
//标题
Element linkNode = (article.select("div h2 a")).get(0);
//文章简介
Element desptionNode = (article.getElementsByClass("summary oneline")).get(0);
//时间
Element articleManageNode = (article.getElementsByClass("time")).get(0);
//阅读量
Element readNum = (article.getElementsByClass("read_num")).get(0);
Element commentNum = (article.getElementsByClass("common_num ")).get(0);
//文章url
articleEntity.setAddress(linkNode.attr("href"));
articleEntity.setTitle(linkNode.text());
articleEntity.setDesption(desptionNode.text());
//articleEntity.setTime(new Date());
if ("".equals(readNum.getElementsByClass("num").text())) {
articleEntity.setCommentNum(0);
}else {
articleEntity.setReadNum(Integer.parseInt(readNum.getElem