Java实现,Http带Cookie请求,爬取公共网站数据。

Java实现,Http带Cookie请求,爬取公共网站数据



前言


提示:以下是本篇文章正文内容,下面案例可供参考

一、使用 HTTP 携带 Cookie 发送请求

package com.keepsoft.devbase.web.config;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.github.pagehelper.util.StringUtil;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;

/**
 * Static HTTP helpers for fetching data from a web site, optionally sending a cookie.
 *
 * <p>{@link #doPOST(String, String)} and {@link #doGet(String)} are built on
 * {@link HttpURLConnection}; {@link #getCookie(String, Map)} uses Apache HttpClient to read the
 * {@code Set-Cookie} header of a page; {@link #resolveJson(String, String)} extracts a JSON array
 * from a response body with fastjson.
 *
 * <p>All methods are best-effort: failures are printed to standard error and reported to the
 * caller through a {@code null} (or empty-string) return value rather than an exception.
 *
 * <p>{@link #cookie} is a plain mutable static field; this class performs no synchronization
 * around it.
 *
 * <p>Copyright: keLeCc. Created 2020/11/27 15:50.
 *
 * @author keLeCc
 * @version V1.0
 */
public class HttpClientConfig {

    /**
     * Value sent as the {@code Cookie} request header by {@link #doPOST(String, String)}.
     * When empty, {@code doPOST} fills it by calling {@link #getCookie(String, Map)} first.
     */
    public static  String cookie = "";

    /** Page requested by {@code doPOST} to obtain a cookie when {@link #cookie} is empty. */
    private static final String COOKIE_SOURCE_URL = "http://www.shipxy.com/";

    /** Connect timeout applied to every {@link HttpURLConnection}, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MILLIS = 15000;

    /** Read timeout applied to every {@link HttpURLConnection}, in milliseconds. */
    private static final int READ_TIMEOUT_MILLIS = 60000;

    /**
     * Sends a form-encoded POST request carrying the cached cookie and returns the response body.
     *
     * @param httpUrl target URL
     * @param param   request body, expected in {@code name1=value1&name2=value2} form; a
     *                {@code null} value is sent as an empty body
     * @return the response body decoded as UTF-8 with every line terminated by {@code "\r\n"},
     *         or {@code null} when the response status is not 200 or an I/O error occurs
     */
    public static String doPOST(String httpUrl, String param) {
        HttpURLConnection connection = null;
        String result = null;
        try {
            URL url = new URL(httpUrl);
            connection = (HttpURLConnection) url.openConnection();

            // Lazily obtain a cookie the first time it is needed, then reuse the cached value.
            if (!StringUtil.isNotEmpty(cookie)) {
                cookie = HttpClientConfig.getCookie(COOKIE_SOURCE_URL, null);
            }
            // NOTE(review): this prints the session cookie to stdout; consider removing it.
            System.out.println("cookie = ["+cookie+"]");
            connection.setRequestProperty("Cookie", cookie);

            connection.setRequestMethod("POST");
            connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            connection.setReadTimeout(READ_TIMEOUT_MILLIS);
            // Output must be enabled explicitly in order to write a request body.
            connection.setDoOutput(true);
            connection.setDoInput(true);
            connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
            // NOTE(review): hard-coded bearer token committed to source; it should come from
            // configuration. Left unchanged here so existing callers keep working.
            connection.setRequestProperty("Authorization", "Bearer da3efcbf-0845-4fe3-8aba-ee040be542c0");

            // Encode explicitly as UTF-8 instead of relying on the platform default charset.
            byte[] body = param == null ? new byte[0] : param.getBytes(StandardCharsets.UTF_8);
            try (OutputStream os = connection.getOutputStream()) {
                os.write(body);
            }

            if (connection.getResponseCode() == HttpURLConnection.HTTP_OK) {
                result = readBody(connection);
            }
        } catch (IOException e) {
            // Also covers MalformedURLException; callers observe the failure as a null result.
            e.printStackTrace();
        } finally {
            // connection is still null when the URL was malformed or could not be opened.
            if (connection != null) {
                connection.disconnect();
            }
        }
        return result;
    }

    /**
     * Sends a GET request and returns the response body. No cookie header is sent.
     *
     * @param httpUrl target URL
     * @return the response body decoded as UTF-8 with every line terminated by {@code "\r\n"},
     *         or {@code null} when the response status is not 200 or an I/O error occurs
     */
    public static String doGet(String httpUrl) {
        HttpURLConnection connection = null;
        String result = null;
        try {
            URL url = new URL(httpUrl);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            connection.setReadTimeout(READ_TIMEOUT_MILLIS);
            connection.connect();

            if (connection.getResponseCode() == HttpURLConnection.HTTP_OK) {
                result = readBody(connection);
            }
        } catch (IOException e) {
            // Also covers MalformedURLException; callers observe the failure as a null result.
            e.printStackTrace();
        } finally {
            // connection is still null when the URL was malformed or could not be opened.
            if (connection != null) {
                connection.disconnect();
            }
        }
        return result;
    }

    /**
     * Reads the whole response body of {@code connection} as UTF-8 text, line by line, appending
     * {@code "\r\n"} after every line.
     */
    private static String readBody(HttpURLConnection connection) throws IOException {
        StringBuilder body = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line).append("\r\n");
            }
        }
        return body.toString();
    }

    /**
     * Requests {@code url} with GET and returns the value of the first {@code Set-Cookie}
     * response header.
     *
     * <p>The header value is returned verbatim, so it includes whatever attributes the server
     * appended after the cookie pair.
     *
     * @param url   page to request
     * @param param query parameters appended to {@code url}; may be {@code null}
     * @return the first {@code Set-Cookie} header value, or an empty string when the status is
     *         not 200, the header is absent, or the request fails
     */
    public static String getCookie(String url,Map<String, String> param) {
        String cookies = "";
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            URIBuilder builder = new URIBuilder(url);
            if (param != null) {
                for (Map.Entry<String, String> entry : param.entrySet()) {
                    builder.addParameter(entry.getKey(), entry.getValue());
                }
            }
            HttpGet httpGet = new HttpGet(builder.build());

            try (CloseableHttpResponse response = httpclient.execute(httpGet)) {
                boolean ok = response.getStatusLine().getStatusCode() == 200;
                // Guard against responses without Set-Cookie instead of indexing an empty array.
                if (ok && response.containsHeader("Set-Cookie")) {
                    cookies = response.getFirstHeader("Set-Cookie").getValue();
                }
            }
        } catch (Exception e) {
            // Deliberately broad: this lookup is best-effort and reports failure as "".
            e.printStackTrace();
        }
        return cookies;
    }

    /**
     * Parses {@code json} and returns the array stored under {@code jsonKey} as a list of maps.
     *
     * @param json    JSON object text containing an integer {@code status} field
     * @param jsonKey key of the array to extract
     * @return the array elements, or {@code null} when the text does not parse to an object,
     *         {@code status} is missing or non-zero, or no array is stored under {@code jsonKey}
     */
    public  static  List<Map<String, Object>>  resolveJson(String json,String jsonKey){
        // Parse once and reuse the result for both lookups.
        JSONObject root = JSON.parseObject(json);
        if (root == null) {
            return null;
        }
        Integer status = root.getInteger("status");
        // Check for null before comparing so a missing status cannot fail on unboxing.
        if (status == null || status != 0) {
            return null;
        }
        JSONArray jsonArray = root.getJSONArray(jsonKey);
        if (jsonArray == null) {
            return null;
        }
        // fastjson returns a raw List here; the declared element type is not checked at runtime.
        @SuppressWarnings("unchecked")
        List<Map<String, Object>> rows = JSONArray.parseObject(jsonArray.toJSONString(), List.class);
        return rows;
    }
}

总结

  • 要注意控制爬取数据的频率
  • 考虑 IP 是否需要使用动态代理
  • 考虑请求失败后是否需要重新请求
您好,关于您的问题,可以使用 Java 的 HttpURLConnection 类来进行网页爬取,并在请求中设置 Cookie。 具体代码如下:

```java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.List;
import java.util.Map;

public class JavaSpiderDemo {
    public static void main(String[] args) {
        String url = "https://www.example.com";
        String cookie = "cookie1=value1; cookie2=value2; cookie3=value3";//设置Cookie
        try {
            URL obj = new URL(url);
            HttpURLConnection httpConn = (HttpURLConnection) obj.openConnection();
            httpConn.setRequestMethod("GET");
            httpConn.setRequestProperty("Cookie", cookie);//设置请求中的Cookie
            int responseCode = httpConn.getResponseCode();
            if (responseCode == HttpURLConnection.HTTP_OK) {
                BufferedReader in = new BufferedReader(new InputStreamReader(httpConn.getInputStream()));
                String inputLine;
                StringBuilder response = new StringBuilder();
                while ((inputLine = in.readLine()) != null) {
                    response.append(inputLine);
                }
                in.close();
                System.out.println("Response:" + response.toString());
                //获取响应头
                Map<String, List<String>> headerFields = httpConn.getHeaderFields();
                for (String key : headerFields.keySet()) {
                    System.out.println(key + ":" + headerFields.get(key));
                }
            } else {
                System.out.println("GET request not worked");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
```

如上代码所示,只需在请求的 Header 中设置 Cookie,即可携带 Cookie 进行网页爬取。同时,我们还可以在代码中添加响应头的获取,获取网页返回的响应头信息。 希望以上回答能够解决您的问题。如有任何疑问,欢迎继续追问!
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

可乐cc呀

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值