java爬虫爬取网站信息

java爬http 网站内容

这种方法很多了

1、

    /**
     * Fetches a page with an HTTP GET request and returns the response body as text.
     *
     * <p>The body is returned for every status code; a status other than 200 is additionally
     * reported through {@code LOGGER}. {@code "utf-8"} is passed to {@code EntityUtils.toString}
     * as the default charset.
     *
     * @param url the absolute URL to request
     * @return the response body, or {@code null} when the request fails with an I/O error or
     *         the response carries no entity
     */
    public static String doHttpGetHtml(String url) {
        // Build the GET request (the equivalent of typing the address into a browser).
        HttpGet request = new HttpGet(url);
        // try-with-resources closes the response first and then the client on every path,
        // replacing the manual closeQuietly calls.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(request)) {
            if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
                // Non-200 (for example 404): report it, but still hand the body back to the caller.
                LOGGER.error("返回状态不是200");
            }
            HttpEntity httpEntity = response.getEntity();
            if (httpEntity == null) {
                // Some responses have no body; EntityUtils.toString rejects a null entity.
                return null;
            }
            return EntityUtils.toString(httpEntity, "utf-8");
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so one handler covers both.
            LOGGER.error("HTTP GET failed: " + url, e);
        }
        return null;
    }

2、

    /**
     * Fetches a page through {@link URLConnection} and returns its content as one string.
     *
     * <p>The response bytes are decoded as GB2312 and read line by line. Every line is echoed
     * to standard output and appended to the result without its line terminator, so the
     * returned text contains no line breaks.
     *
     * @param strUrl the URL to read
     * @return the concatenated lines, or the literal string {@code "null"} (not a null
     *         reference) when opening or reading the URL fails
     */
    public static String doHttpGetHtml2(String strUrl) {
        try {
            URL url = new URL(strUrl);
            // Open the connection to the page.
            URLConnection conn = url.openConnection();
            // try-with-resources guarantees the stream and reader are closed even when
            // reading or decoding fails part-way through.
            try (InputStream is = conn.getInputStream();
                 BufferedReader br = new BufferedReader(new InputStreamReader(is, "GB2312"))) {
                StringBuilder html = new StringBuilder();
                String line;
                // Read line by line, printing each line as it arrives.
                while ((line = br.readLine()) != null) {
                    System.out.println(line);
                    html.append(line);
                }
                return html.toString();
            }
        } catch (Exception e) {
            // Best-effort helper: any failure is reported and mapped to the "null" marker.
            e.printStackTrace();
        }
        return "null";
    }

 

java爬https 网站内容

下载网站的证书 

查看

点击检查——点击security出现如下界面

 点击证书路径 我们可以发现证书如下:

下载

使用IE浏览器访问该网站,点击工具——Internet选项

 

 接下来点击导出

 下一步

选择保存的目录以及文件名

使用 Java 命令行将证书导入密钥库

进入JDK 安装bin目录,命令行运行:

keytool -import -alias Root -file D:/usr/key.cer -keystore "D:/usr/key.keystore" -storepass 123456

查看请求头

代码

   /**
    * Fetches a page over HTTPS with a GET request and returns the response body as text.
    *
    * <p>The client uses the socket factory from {@code createSSLConnSocketFactory()}, sends a
    * desktop-browser user-agent header and applies 10-second connect and socket timeouts.
    * The request line, the status line and (for a non-200 status) the response body are
    * printed to standard output.
    *
    * @param url     the absolute URL to request
    * @param charset name of the default charset passed to {@code EntityUtils.toString}
    * @return the body of a 200 response with non-breaking spaces turned into plain spaces;
    *         an empty string for any other status, a missing body, or any failure
    */
    public static String getHttps(String url, String charset) {
        String html = "";
        // try-with-resources closes the response and then the client on every path.
        try (CloseableHttpClient httpClient = HttpClients.custom()
                .setSSLSocketFactory(createSSLConnSocketFactory()).build()) {
            // Build the GET request (the equivalent of typing the address into a browser).
            HttpGet request = new HttpGet(url);
            request.addHeader("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36");
            // Connect timeout and socket (read) timeout, both in milliseconds.
            RequestConfig config = RequestConfig.custom()
                    .setSocketTimeout(10000).setConnectTimeout(10000).build();
            request.setConfig(config);
            System.out.println("Executing request " + request.getRequestLine());

            try (CloseableHttpResponse response = httpClient.execute(request)) {
                HttpEntity httpEntity = response.getEntity();
                System.out.println("----------------------------------------");
                System.out.println(response.getStatusLine());

                int statusCode = response.getStatusLine().getStatusCode();
                if (statusCode == HttpStatus.SC_OK) {
                    if (httpEntity != null) {
                        html = EntityUtils.toString(httpEntity, charset);
                        // The published listing replaced a space with a space, which is a
                        // no-op. Presumably the first argument was a non-breaking space
                        // (U+00A0) lost in rendering - NOTE(review): confirm.
                        html = html.replace('\u00A0', ' ');
                    }
                } else {
                    // Non-200 (for example 404): only report it; the result stays empty.
                    System.out.println("返回状态不是200");
                    if (httpEntity != null) {
                        // Guarded because EntityUtils.toString rejects a null entity.
                        System.out.println(EntityUtils.toString(httpEntity, charset));
                    }
                }
                EntityUtils.consume(httpEntity);
            }
        } catch (Exception e) {
            // Best-effort helper: failures are reported and yield the empty result.
            e.printStackTrace();
        }
        return html;
    }

    /**
     * Builds the SSL socket factory used for HTTPS requests.
     *
     * <p>Trust material is loaded from the hard-coded keystore file with its hard-coded
     * password (the two must match the keystore created with keytool), combined with a
     * {@code TrustSelfSignedStrategy}. The default hostname verifier is kept.
     *
     * @return a socket factory restricted to TLS 1.2
     * @throws Exception if the keystore cannot be read or the SSL context cannot be built
     */
    private static SSLConnectionSocketFactory createSSLConnSocketFactory()
            throws Exception {
        SSLContext sslcontext = SSLContexts
                .custom()
                .loadTrustMaterial(
                        new File("D:\\usr\\key.keystore"),
                        "123456".toCharArray(),
                        new TrustSelfSignedStrategy())
                .build();
        // TLS 1.0 is deprecated and disabled by default in current JDKs, so offering only
        // "TLSv1" makes the handshake fail; TLS 1.2 is the widely supported baseline.
        return new SSLConnectionSocketFactory(
                sslcontext, new String[] { "TLSv1.2" }, null,
                SSLConnectionSocketFactory.getDefaultHostnameVerifier());
    }

 另外一种方式

    /**
     * Fetches a page over HTTPS with {@link HttpsURLConnection} and returns its content.
     *
     * <p>The call first sleeps for five seconds, then issues a GET request and reads the
     * response as UTF-8 line by line, terminating every line with {@code '\n'}.
     *
     * <p>NOTE(review): the connection is configured with {@code TrustAnyTrustManager}, which
     * is defined outside this block; its name suggests that it accepts every server
     * certificate, which would disable certificate validation - confirm before using this
     * against anything but trusted test hosts.
     *
     * @param url the HTTPS URL to request
     * @return the response text with non-breaking spaces turned into plain spaces
     * @throws Exception if the wait is interrupted, the TLS setup fails, or the request or
     *                   read fails; the stack trace is printed before rethrowing
     */
    public static String sendHttps(String url) throws Exception {
        StringBuilder body = new StringBuilder();
        HttpsURLConnection conn = null;
        try {
            // Fixed delay before every request.
            Thread.sleep(5000);
            SSLContext sc = SSLContext.getInstance("TLS");
            sc.init(null, new TrustManager[] { new TrustAnyTrustManager() }, new java.security.SecureRandom());
            conn = (HttpsURLConnection) new URL(url).openConnection();
            conn.setSSLSocketFactory(sc.getSocketFactory());
            conn.setRequestMethod("GET");
            conn.connect();
            // The reader (and the underlying stream) is closed even when reading fails.
            try (BufferedReader reader =
                         new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    body.append(line);
                    body.append("\n");
                }
            }
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Restore the interrupt flag so callers further up can observe it.
                Thread.currentThread().interrupt();
            }
            e.printStackTrace();
            throw e;
        } finally {
            // Release the connection on both the success and the failure path.
            if (conn != null) {
                conn.disconnect();
            }
        }

        // The published listing replaced a space with a space, which is a no-op. Presumably
        // the first argument was a non-breaking space (U+00A0) lost in rendering -
        // NOTE(review): confirm.
        return body.toString().replace('\u00A0', ' ');
    }

 先到这里吧,一点一点尝试过的,不过后来写上来可能有缺漏,听说python好用点,过两天试试

 

 

  • 2
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

瑶山

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值