java爬取国家应急平台漏洞公告数据

Java 爬取 http 类型的网站比较容易实现,因为不需要建立证书通道,直接通过 HttpClient 访问链接获取页面源码即可得到相关数据;本文介绍通过导入证书的方式,实现 Java 爬取 https 网站的数据。

获取https网站证书的方式可以看我上一篇博客:https://blog.csdn.net/qq_36706878/article/details/102546563

这里直接附上源码和运行结果

import java.io.File;
import java.io.IOException;

import javax.net.ssl.SSLContext;

import org.apache.http.*;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.ssl.SSLContexts;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.apache.commons.logging.*;

public class HttpsReptile {
	// Base domain of the National Internet Emergency Center (CNCERT);
	// relative "next page" hrefs from the listing are resolved against it.
	public static String mainUrl = "https://www.cert.org.cn";

	// Keystore holding the site's HTTPS certificate chain. File and password
	// must match the keystore exported for the site (see the companion post).
	private static final String KEYSTORE_PATH = "E:/key.keystore";
	private static final String KEYSTORE_PASSWORD = "123456";

	/**
	 * Entry point of the crawl: fetches page 1 of the vulnerability bulletin
	 * list, processes it, then walks every following page via the
	 * "next page" link.
	 */
	public static void httpsReptile(){
		// Vulnerability bulletin listing page (page 1).
		String leakUrl = "https://www.cert.org.cn/publish/main/9/index.html";
		Document leakListDoc = sendHttp(leakUrl);
		int page = 1;
		System.out.println("处理第"+page+"页数据");
		solvePageData(leakListDoc);
		getPageCode(leakListDoc, page);
	}

	/**
	 * Follows the "下一页" (next page) link page by page, processing each
	 * newly fetched page with {@link #solvePageData(Document)}.
	 *
	 * Implemented iteratively instead of recursively so a long listing cannot
	 * overflow the call stack, and the anchor scan stops at the first match so
	 * a page containing several "下一页" anchors is not crawled more than once
	 * (the original kept scanning after recursing, which could re-fetch pages).
	 *
	 * @param leakListDoc parsed document of the page already processed
	 * @param page        1-based number of that page (used for logging only)
	 */
	public static void getPageCode(Document leakListDoc, int page) {
		Document currentDoc = leakListDoc;
		boolean foundNext = true;
		while (foundNext) {
			foundNext = false;
			Elements aElements = currentDoc.getElementsByTag("a");
			for (int i = 0; i < aElements.size(); i++) {
				Element anchor = aElements.get(i);
				if (anchor.toString().contains("下一页")) {
					// hrefs are site-relative, e.g. "/publish/main/9/index_2.html".
					String nextHref = mainUrl + anchor.attr("href");
					currentDoc = sendHttp(nextHref);
					page++;
					System.out.println("处理第"+page+"页数据");
					solvePageData(currentDoc);
					foundNext = true;
					break; // follow only the first "next page" link per page
				}
			}
		}
	}

	/**
	 * Extracts the detail-page links from one listing page and prints them.
	 *
	 * Bulletins live in {@code <ul class="waring_con">} (sic — the site's own
	 * class name) as {@code <li>} items. The original implementation printed
	 * {@code li.toString().substring(85, 159)}, which throws
	 * StringIndexOutOfBoundsException for any item shorter than 159 characters
	 * and silently breaks whenever the markup shifts; here the link is read
	 * from the item's onclick attribute, falling back to a nested
	 * {@code <a href>}.
	 *
	 * NOTE(review): assumes the detail URL sits in the li's onclick handler or
	 * a child anchor — confirm against the live markup.
	 *
	 * @param leakListDoc parsed listing page
	 */
	public static void solvePageData(Document leakListDoc) {
		Elements ulElements = leakListDoc.getElementsByClass("waring_con");
		for (int i = 0; i < ulElements.size(); i++) {
			Elements liElements = ulElements.get(i).getElementsByTag("li");
			for (int j = 0; j < liElements.size(); j++) {
				Element li = liElements.get(j);
				String detailLink = li.attr("onclick");
				if (detailLink.isEmpty()) {
					// Fallback: first non-empty href of a nested anchor.
					for (Element link : li.getElementsByTag("a")) {
						String href = link.attr("href");
						if (!href.isEmpty()) {
							detailLink = href;
							break;
						}
					}
				}
				System.out.println(detailLink);
			}
		}
	}

	/**
	 * Performs an HTTPS GET against {@code url} over the keystore-backed SSL
	 * channel and parses the response body with Jsoup.
	 *
	 * On any failure (non-200 status, network error, SSL error) the method
	 * logs the problem and returns a document parsed from the empty string
	 * rather than throwing, so the page loop can continue.
	 *
	 * @param url absolute URL to fetch
	 * @return parsed document; empty document when the request failed
	 */
	public static Document sendHttp(String url) {
		String html = "";
		CloseableHttpClient httpclient = null;
		CloseableHttpResponse response = null;
		try {
			SSLConnectionSocketFactory sslsf = createSSLConnSocketFactory();
			httpclient = HttpClients.custom()
				.setSSLSocketFactory(sslsf).build();
			HttpGet httpget = new HttpGet(url);
			// Browser-like UA: some sites reject the default Java agent.
			httpget.addHeader(HttpHeaders.USER_AGENT,
					"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");
			// Connect and socket-read timeouts (ms) so a dead server cannot
			// hang the crawl indefinitely.
			RequestConfig requestConfig = RequestConfig.custom()
					.setSocketTimeout(10000).setConnectTimeout(10000).build();
			httpget.setConfig(requestConfig);
			System.out.println("Executing request " + httpget.getRequestLine());
			response = httpclient.execute(httpget);
			HttpEntity entity = response.getEntity();
			System.out.println("----------------------------------------");
			System.out.println(response.getStatusLine());
			int resStatu = response.getStatusLine().getStatusCode();
			if (resStatu == HttpStatus.SC_OK) { // only a 200 body is usable
				if (entity != null) {
					html = EntityUtils.toString(entity, "UTF-8");
					// Normalize HTML non-breaking spaces for later parsing.
					html = html.replace("&nbsp;", " ");
				}
			}
			// Ensure the entity stream is fully consumed so the connection
			// can be released cleanly.
			EntityUtils.consume(entity);
		} catch(Exception e){
			e.printStackTrace();
		}finally{
			if(response!=null){
				try {
					response.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if(httpclient!=null){
				try {
					httpclient.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return Jsoup.parse(html);
	}

	/**
	 * Builds an SSL socket factory trusting the certificate material in
	 * {@link #KEYSTORE_PATH} so the target site's HTTPS handshake succeeds.
	 *
	 * SECURITY NOTE: {@code TrustSelfSignedStrategy} accepts any self-signed
	 * certificate — acceptable for a one-off crawler, not for production use.
	 *
	 * @return TLS-only socket factory backed by the keystore
	 * @throws Exception if the keystore cannot be read or the context built
	 */
	private static SSLConnectionSocketFactory createSSLConnSocketFactory()
			throws Exception {
		SSLContext sslcontext = SSLContexts
				.custom()
				.loadTrustMaterial(
						new File(KEYSTORE_PATH),
						KEYSTORE_PASSWORD.toCharArray(),
						new TrustSelfSignedStrategy()) // keystore file and password must match
				.build();
		// Offer TLSv1.2 alongside TLSv1 — modern servers routinely reject
		// TLSv1-only handshakes (original listed only "TLSv1").
		return new SSLConnectionSocketFactory(
				sslcontext, new String[] { "TLSv1", "TLSv1.2" }, null,
				SSLConnectionSocketFactory.getDefaultHostnameVerifier());
	}

	public static void main(String[] args) {
		httpsReptile();
		//sendHttp("https://blog.csdn.net/qq_36706878");
	}

}

运行结果如下:获取到了详情页的各个链接,再对这些链接逐一请求即可获取详情数据。

  • 2
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
爬取CNVD漏洞信息,可以使用Java中的HttpClient和Jsoup库来实现。具体步骤如下: 1. 使用HttpClient发送请求,获取CNVD漏洞列表页面的HTML源码。 ```java CloseableHttpClient httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet("http://www.cnvd.org.cn/flaw/list.htm"); CloseableHttpResponse response = httpClient.execute(httpGet); String html = EntityUtils.toString(response.getEntity(), "UTF-8"); ``` 2. 使用Jsoup解析HTML源码,获取漏洞列表。 ```java Document doc = Jsoup.parse(html); Elements flawList = doc.select(".flaw_list tr:gt(0)"); for (Element flaw : flawList) { // 获取漏洞信息,如漏洞名称、CNVD-ID、公开日期、危害级别等 String name = flaw.select(".flaw_tit a").text(); String cnvdId = flaw.select(".flaw_list_c td:eq(1)").text(); String publicDate = flaw.select(".flaw_list_c td:eq(2)").text(); String severity = flaw.select(".flaw_list_c td:eq(3)").text(); // TODO: 处理漏洞信息 } ``` 3. 可以将获取到的漏洞信息存储到数据库或者文件中。 完整代码示例: ```java import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; public class CnvdCrawler { public static void main(String[] args) throws IOException { CloseableHttpClient httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet("http://www.cnvd.org.cn/flaw/list.htm"); CloseableHttpResponse response = httpClient.execute(httpGet); String html = EntityUtils.toString(response.getEntity(), "UTF-8"); Document doc = Jsoup.parse(html); Elements flawList = doc.select(".flaw_list tr:gt(0)"); for (Element flaw : flawList) { String name = flaw.select(".flaw_tit a").text(); String cnvdId = flaw.select(".flaw_list_c td:eq(1)").text(); String publicDate = flaw.select(".flaw_list_c td:eq(2)").text(); String severity = flaw.select(".flaw_list_c td:eq(3)").text(); System.out.println(name + " 
" + cnvdId + " " + publicDate + " " + severity); } } } ```
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值