Java 爬取 HTTP 网站内容
实现方式有很多,这里列举两种。
1、
/**
 * Fetches the body of {@code url} with an HTTP GET using Apache HttpClient.
 *
 * <p>The body is returned for every status code; a status other than 200 is
 * additionally reported through {@code LOGGER}. A new client is created and
 * closed on every call.
 *
 * @param url address to request
 * @return the response body, decoded with UTF-8 as the default charset, or
 *         {@code null} when the request fails with an I/O error or the
 *         response carries no body
 */
public static String doHttpGetHtml(String url) {
    // 1. Create the client (comparable to opening a browser).
    CloseableHttpClient httpClient = HttpClients.createDefault();
    CloseableHttpResponse response = null;
    try {
        // 2. Build the GET request (comparable to typing the address).
        //    Done inside the try so the client is still closed if the URL is rejected.
        HttpGet request = new HttpGet(url);
        // 3. Execute it (comparable to pressing Enter).
        response = httpClient.execute(request);
        // 4. Anything other than 200 (e.g. 404) is only logged; the body is still returned.
        if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
            LOGGER.error("返回状态不是200");
        }
        // 5. Read the body. Responses without a body have no entity, and
        //    EntityUtils.toString rejects a null entity, so guard first.
        HttpEntity httpEntity = response.getEntity();
        if (httpEntity == null) {
            return null;
        }
        return EntityUtils.toString(httpEntity, "utf-8");
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so one handler covers both.
        LOGGER.error("HTTP GET failed: " + url, e);
    } finally {
        // 6. Release the connection and the client.
        HttpClientUtils.closeQuietly(response);
        HttpClientUtils.closeQuietly(httpClient);
    }
    return null;
}
2、
/**
 * Fetches the content behind {@code strUrl} using {@link URLConnection}.
 *
 * <p>The response is decoded as GB2312 and read line by line. Every line is
 * echoed to standard output and appended to the result <em>without</em> its
 * line terminator, so the returned text is a single line.
 *
 * @param strUrl address to read from
 * @return the concatenated lines, or the literal string {@code "null"} (not a
 *         null reference) when anything goes wrong; kept as-is because
 *         existing callers may compare against that value
 */
public static String doHttpGetHtml2(String strUrl) {
    try {
        URL url = new URL(strUrl);
        // Open a connection to the page.
        URLConnection conn = url.openConnection();
        // try-with-resources closes the stream and reader even when reading fails.
        // The bytes are converted to characters with a fixed GB2312 decoder.
        try (InputStream is = conn.getInputStream();
             BufferedReader br = new BufferedReader(new InputStreamReader(is, "GB2312"))) {
            StringBuilder html = new StringBuilder();
            String line;
            // Read, print and collect each line.
            while ((line = br.readLine()) != null) {
                System.out.println(line);
                html.append(line);
            }
            return html.toString();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return "null";
}
java爬https 网站内容
下载网站的证书
查看
点击检查——点击security出现如下界面
点击证书路径 我们可以发现证书如下:
下载
使用IE浏览器访问该网站,点击工具——Internet选项
接下来点击导出
下一步
选择保存的目录以及文件名
使用 JDK 自带的 keytool 命令行工具将证书导入密钥库
进入JDK 安装bin目录,命令行运行:
keytool -import -alias Root -file D:/usr/key.cer -keystore "D:/usr/key.keystore" -storepass 123456
查看请求头
代码
/**
 * Fetches an HTTPS page with Apache HttpClient, using the SSL socket factory
 * built by {@code createSSLConnSocketFactory()}.
 *
 * <p>The request is sent with a desktop-Chrome user agent and 10 second
 * connect/socket timeouts. Progress and the status line are printed to
 * standard output. For a status other than 200 the body is only printed,
 * not returned.
 *
 * @param url     address to request
 * @param charset default charset name used to decode the body
 * @return the body of a 200 response with non-breaking spaces replaced by
 *         ordinary spaces, or an empty string for any other status or on failure
 */
public static String getHttps(String url, String charset) {
    String html = "";
    // try-with-resources closes the response first, then the client, on every path.
    try (CloseableHttpClient httpClient = HttpClients.custom()
            .setSSLSocketFactory(createSSLConnSocketFactory())
            .build()) {
        // Build the GET request (comparable to typing the address into a browser).
        HttpGet request = new HttpGet(url);
        request.addHeader("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36");
        // Connect and socket (read) timeouts, both 10 s.
        RequestConfig config = RequestConfig.custom()
                .setSocketTimeout(10000)
                .setConnectTimeout(10000)
                .build();
        request.setConfig(config);
        System.out.println("Executing request " + request.getRequestLine());

        // Execute the request (comparable to pressing Enter).
        try (CloseableHttpResponse response = httpClient.execute(request)) {
            HttpEntity httpEntity = response.getEntity();
            System.out.println("----------------------------------------");
            System.out.println(response.getStatusLine());
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == HttpStatus.SC_OK) {
                if (httpEntity != null) {
                    // NOTE(review): the original replace() arguments looked identical;
                    // the intent appears to be turning non-breaking spaces (U+00A0)
                    // into ordinary spaces, so that is now spelled out explicitly.
                    html = EntityUtils.toString(httpEntity, charset).replace("\u00A0", " ");
                }
            } else {
                // Non-200 (e.g. 404): only report it; the caller still gets "".
                System.out.println("返回状态不是200");
                if (httpEntity != null) {
                    System.out.println(EntityUtils.toString(httpEntity, charset));
                }
            }
            // Make sure the entity is fully consumed before the response is closed.
            EntityUtils.consume(httpEntity);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return html;
}
// ssl通道证书的创建
private static SSLConnectionSocketFactory createSSLConnSocketFactory()
throws Exception {
SSLContext sslcontext = SSLContexts
.custom()
.loadTrustMaterial(
new File(
"D:\\usr\\key.keystore"),
"123456".toCharArray(), new TrustSelfSignedStrategy()) //文件和密码要对应
.build();
SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(
sslcontext, new String[] { "TLSv1" }, null,
SSLConnectionSocketFactory.getDefaultHostnameVerifier());
return sslsf;
}
另外一种方式
/**
 * Fetches an HTTPS page with {@link HttpsURLConnection}, using an SSL context
 * whose only trust manager is {@code TrustAnyTrustManager}.
 *
 * <p>NOTE(review): judging by its name, {@code TrustAnyTrustManager} accepts any
 * server certificate, which would remove protection against man-in-the-middle
 * attacks. Confirm against its definition before using this outside experiments.
 * No custom hostname verifier is installed on the connection.
 *
 * <p>The call sleeps for 5 seconds before sending the request (presumably to
 * throttle crawling; confirm with the author).
 *
 * @param url HTTPS address to request
 * @return the response decoded as UTF-8, each line terminated by {@code "\n"},
 *         with non-breaking spaces replaced by ordinary spaces
 * @throws Exception any failure (including interruption of the sleep), rethrown
 *         after its stack trace has been printed
 */
public static String sendHttps(String url) throws Exception {
    StringBuilder body = new StringBuilder();
    HttpsURLConnection conn = null;
    try {
        // sleep is static; call it on the class rather than through an instance.
        Thread.sleep(5000);
        SSLContext sc = SSLContext.getInstance("TLS");
        sc.init(null, new TrustManager[] { new TrustAnyTrustManager() }, new java.security.SecureRandom());
        conn = (HttpsURLConnection) new URL(url).openConnection();
        conn.setSSLSocketFactory(sc.getSocketFactory());
        conn.setRequestMethod("GET");
        conn.connect();
        // try-with-resources closes the reader (and the underlying stream) even on failure.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line).append("\n");
            }
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up can observe it.
        Thread.currentThread().interrupt();
        e.printStackTrace();
        throw e;
    } catch (Exception e) {
        e.printStackTrace();
        throw e;
    } finally {
        // Previously skipped when an exception occurred.
        if (conn != null) {
            conn.disconnect();
        }
    }
    // NOTE(review): the original replace() arguments looked identical; the intent
    // appears to be normalising non-breaking spaces (U+00A0) to ordinary spaces.
    return body.toString().replace("\u00A0", " ");
}
先到这里吧,一点一点尝试过的,不过后来写上来可能有缺漏,听说python好用点,过两天试试