今天遇到一个需求：类似爬取页面信息——发送 GET 请求获取整个页面的 HTML，再从中提取需要的信息。这里用到了 jsoup 这个工具类，分享给大家。注意：代码里发 HTTP 请求用的是 Hutool 的 HttpUtil，所以除了下面的 jsoup 依赖，还需要引入 hutool 依赖。
jsoup 的 Maven 依赖：
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
package com.example.saasdemo;

import cn.hutool.core.io.file.FileWriter;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONObject;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;

import java.time.LocalDateTime;
import java.util.Date;
import java.util.UUID;
/**
 * Crawler test for CNNVD vulnerability pages.
 *
 * <p>Fetches each listing page (URL template {@code address_1}, where {@code "*"}
 * is replaced by the page number), follows every vulnerability's detail link
 * (relative to {@code address_2}), extracts three sections — the detail
 * key/value table, the summary, and the announcement/reference/affected-entity
 * blocks — and appends the formatted text to a local file.
 *
 * <p>HTTP is done with Hutool's {@code HttpUtil}; HTML parsing with jsoup.
 * The two target URLs are intentionally redacted ("秘密").
 */
@Slf4j
@SpringBootTest
public class Spider {

    /** Listing-page URL template; "*" is replaced by the 1-based page number. */
    final String address_1 = "秘密";
    /** Base URL prepended to the relative detail-page links found in listings. */
    final String address_2 = "秘密";

    @Test
    void testCNNVD() {
        // Open the output writer once, outside the loops; each write() call
        // below appends (second argument true), so behavior is unchanged but
        // we no longer allocate a FileWriter per detail page.
        String filePath = "D:\\破烂\\秘密.txt";
        FileWriter writer = new FileWriter(filePath);
        // The full crawl spans 1982 listing pages; capped at 1 while testing.
        for (int i = 1; i <= /*1982*/1; i++) {
            String listingUrl = address_1.replace("*", String.valueOf(i));
            Document listing = Jsoup.parse(HttpUtil.get(listingUrl));
            // Each <li> under div.list_list is one vulnerability entry.
            Elements rows = listing.select("div[class=list_list]").select("li");
            for (Element row : rows) {
                String href = row.select("a").attr("href");
                Document detail = Jsoup.parse(HttpUtil.get(address_2 + href));
                Elements div = detail.select("div[class=fl w770]").select("div");

                StringBuilder page = new StringBuilder();
                appendDetailInfo(div, page);   // vulnerability detail table
                appendSummary(div, page);      // vulnerability summary
                appendAnnouncements(div, page); // announcements / references / affected entities / patches

                // NOTE(review): pageInfo is assembled but never persisted — only
                // the plain "page" text is written below. Kept so a later change
                // can switch the output to structured JSON.
                JSONObject pageInfo = new JSONObject();
                pageInfo.set("info", page);
                pageInfo.set("id", UUID.randomUUID().toString());
                pageInfo.set("date", LocalDateTime.now().toString());

                writer.write("---------------------------------------------\n" + page, true);
            }
        }
    }

    /**
     * Appends the vulnerability detail section: the page title, the detail
     * heading, then one "label + linked value" line per {@code <li>} row.
     */
    private void appendDetailInfo(Elements div, StringBuilder page) {
        String title = div.select("div[class=title_bt w770]").select("H2").text();
        String heading = div.select("div[class=detail_xq w770]").select("H2").text();
        StringBuilder body = new StringBuilder();
        for (Element li : div.select("div[class=detail_xq w770]").select("li")) {
            body.append(li.select("span").text())
                .append(li.select("a").text())
                .append("\n");
        }
        page.append(title).append("\n").append(heading).append("\n").append(body);
    }

    /** Appends the vulnerability summary: its heading plus all paragraph text. */
    private void appendSummary(Elements div, StringBuilder page) {
        String heading = div.select("div[class=d_ldjj]")
                .select("div[class=title_bt]")
                .select("H2")
                .text();
        StringBuilder body = new StringBuilder();
        for (Element p : div.select("div[class=d_ldjj]").select("p")) {
            body.append(p.text());
        }
        page.append(heading).append("\n").append(body).append("\n");
    }

    /**
     * Appends the "announcements / reference URLs / affected entities / patches"
     * sections. Each section contributes its heading, its paragraph text (only
     * when non-empty), and its vulnerability-list items.
     */
    private void appendAnnouncements(Elements div, StringBuilder page) {
        StringBuilder out = new StringBuilder();
        for (Element section : div.select("div[class=d_ldjj m_t_20]")) {
            String heading = section.select("div[class=title_bt]").select("H2").text();
            StringBuilder paragraphs = new StringBuilder();
            for (Element p : section.select("p")) {
                paragraphs.append(p.text()).append("\n");
            }
            StringBuilder listItems = new StringBuilder();
            for (Element li : section.select("div[class=vulnerability_list]").select("li")) {
                listItems.append(li.text()).append("\n");
            }
            out.append(heading).append("\n");
            // length() > 0 already implies the string is non-empty; the original
            // additionally checked !"".equals(...) which was redundant.
            if (paragraphs.length() > 0) {
                out.append(paragraphs).append("\n");
            }
            out.append(listItems).append("\n");
        }
        page.append(out).append("\n");
    }
}