Java实现,Http带Cookie请求,爬取公共网站数据
前言
提示:以下是本篇文章正文内容,下面案例可供参考
一、使用http带Cookie 请求
package com.keepsoft.devbase.web.config;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.github.pagehelper.util.StringUtil;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.List;
import java.util.Map;
/**
* Copyright: keLeCc
*
* @author: keLeCc
* @version: V1.0
* @Date: 2020/11/27 15:50
*/
public class HttpClientConfig {
public static String cookie = "";
/**
* HttpClient post 请求
* @param httpUrl
* @param param
* @return
*/
public static String doPOST(String httpUrl, String param) {
HttpURLConnection connection = null;
InputStream is = null;
OutputStream os = null;
BufferedReader br = null;
String result = null;
try {
URL url = new URL(httpUrl);
// 通过远程url连接对象打开连接
connection = (HttpURLConnection) url.openConnection();
// 设置cookie
if(StringUtil.isNotEmpty(cookie)){
connection.setRequestProperty("Cookie", cookie);
}else {
cookie = HttpClientConfig.getCookie("http://www.shipxy.com/",null);
connection.setRequestProperty("Cookie", cookie);
}
System.out.println("cookie = ["+cookie+"]");
// 设置连接请求方式
connection.setRequestMethod("POST");
// 设置连接主机服务器超时时间:15000毫秒
connection.setConnectTimeout(15000);
// 设置读取主机服务器返回数据超时时间:60000毫秒
connection.setReadTimeout(60000);
// 默认值为:false,当向远程服务器传送数据/写数据时,需要设置为true
connection.setDoOutput(true);
// 默认值为:true,当前向远程服务读取数据时,设置为true,该参数可有可无
connection.setDoInput(true);
// 设置传入参数的格式:请求参数应该是 name1=value1&name2=value2 的形式。
connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
// 设置鉴权信息:Authorization: Bearer da3efcbf-0845-4fe3-8aba-ee040be542c0
connection.setRequestProperty("Authorization", "Bearer da3efcbf-0845-4fe3-8aba-ee040be542c0");
connection.setRequestProperty("Cookie", cookie);
// 通过连接对象获取一个输出流
os = connection.getOutputStream();
// 通过输出流对象将参数写出去/传输出去,它是通过字节数组写出的
os.write(param.getBytes());
// 通过连接对象获取一个输入流,向远程读取
if (connection.getResponseCode() == 200) {
is = connection.getInputStream();
// 对输入流对象进行包装:charset根据工作项目组的要求来设置
br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
StringBuffer sbf = new StringBuffer();
String temp = null;
// 循环遍历一行一行读取数据
while ((temp = br.readLine()) != null) {
sbf.append(temp);
sbf.append("\r\n");
}
result = sbf.toString();
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
// 关闭资源
if (null != br) {
try {
br.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (null != os) {
try {
os.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (null != is) {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
// 断开与远程地址url的连接
connection.disconnect();
}
return result;
}
/**
* HttpClient get 请求
* @param httpUrl
* @return
*/
public static String doGet(String httpUrl) {
HttpURLConnection connection = null;
InputStream is = null;
BufferedReader br = null;
// 返回结果字符串
String result = null;
try {
// 创建远程url连接对象
URL url = new URL(httpUrl);
// 通过远程url连接对象打开一个连接,强转成httpURLConnection类
connection = (HttpURLConnection) url.openConnection();
// 设置连接方式:get
connection.setRequestMethod("GET");
// 设置连接主机服务器的超时时间:15000毫秒
connection.setConnectTimeout(15000);
// 设置读取远程返回的数据时间:60000毫秒
connection.setReadTimeout(60000);
// 发送请求
connection.connect();
// 通过connection连接,获取输入流
if (connection.getResponseCode() == 200) {
is = connection.getInputStream();
// 封装输入流is,并指定字符集
br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
// 存放数据
StringBuffer sbf = new StringBuffer();
String temp = null;
while ((temp = br.readLine()) != null) {
sbf.append(temp);
sbf.append("\r\n");
}
result = sbf.toString();
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
// 关闭资源
if (null != br) {
try {
br.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (null != is) {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
// 关闭远程连接
connection.disconnect();
}
return result;
}
/**
* 通过get请求访问网站首页,返回[cookie]
* @param url
* @param param
* @return
*/
public static String getCookie(String url,Map<String, String> param) {
// 创建Httpclient对象
CloseableHttpClient httpclient = HttpClients.createDefault();
String cookies = "";
CloseableHttpResponse response = null;
try {
// 创建uri
URIBuilder builder = new URIBuilder(url);
if (param != null) {
for (String key : param.keySet()) {
builder.addParameter(key, param.get(key));
}
}
URI uri = builder.build();
// 创建http GET请求
HttpGet httpGet = new HttpGet(uri);
// 执行请求
response = httpclient.execute(httpGet);
// 判断返回状态是否为200
if (response.getStatusLine().getStatusCode() == 200) {
cookies = response.getHeaders("Set-Cookie")[0].getValue();
}
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
if (response != null) {
response.close();
}
httpclient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return cookies;
}
/**
* 解析json,返回list
* @param json
* @param jsonKey
* @return
*/
public static List<Map<String, Object>> resolveJson(String json,String jsonKey){
Integer status = JSON.parseObject(json).getInteger("status");
if(status != 0){
return null;
}
JSONArray jsonArray = JSON.parseObject(json).getJSONArray(jsonKey);
String jsonString = jsonArray.toJSONString();
return JSONArray.parseObject(jsonString,List.class);
}
}
总结
- 要注意爬数据的频率
- Ip是否需要动态代理
- 失败是否重新请求