import com.crawl.getdemo;
import com.crawl.getdemo2;
import java.io.IOException;
/**
 * Command-line entry point for the crawler demo.
 *
 * <p>All HTTP and parsing work is delegated to {@code getdemo2}; this class
 * only instantiates it and triggers the single demo routine.
 */
public class SimpleClient {

    /**
     * Runs the {@code getdemo2} crawler once.
     *
     * @param args command-line arguments; not read
     * @throws IOException declared because {@code getdemo2.testHttpClientA()} declares it
     */
    public static void main(String[] args) throws IOException {
        // The experiment that issued the request through a
        // GetMethod / executeMethod style API is no longer kept here;
        // the crawler class performs the request itself.
        final getdemo2 crawler = new getdemo2();
        crawler.testHttpClientA();
    }
}
getdemo 类(入门)
获取页面信息
package com.crawl;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;
/**
 * Introductory crawler example: fetches one page with Apache HttpClient and
 * dumps the status code, the response headers and the raw body to standard output.
 */
public class getdemo {

    /** Page requested by the demo. */
    private static final String TARGET_URL = "http://www.biquge.com.tw/17_17380/";

    /** Charset used to decode the body; decoding this page as UTF-8 produced garbled Chinese text. */
    private static final String PAGE_CHARSET = "GBK";

    /**
     * Performs a GET request against {@link #TARGET_URL} and prints, in order:
     * the HTTP status code, every response header (name, value, then the whole
     * header line) and the body, line by line.
     *
     * <p>I/O failures are reported with a stack trace on standard error and are
     * not rethrown, so the demo stays best-effort.
     *
     * @throws IOException kept in the signature for backward compatibility with existing callers
     */
    public void testHttpClientA() throws IOException {
        // try-with-resources closes the response and then the client even when
        // execute() fails, in which case there is no response object to close.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(new HttpGet(TARGET_URL))) {

            // Status code of the response (200 on success).
            System.out.println(response.getStatusLine().getStatusCode());

            // Every response header: name, value, and the full "name: value" form.
            for (Header header : response.getAllHeaders()) {
                System.out.println(header.getName());
                System.out.println(header.getValue());
                System.out.println(header.toString());
            }

            // Read the body as a stream. The entity content is consumed exactly
            // once here; converting it to a String beforehand would leave
            // nothing for the stream reader.
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(entity.getContent(), PAGE_CHARSET))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        System.out.println(line);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
getdemo2 类
初步处理页面将标签省略
package com.crawl;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import static java.awt.SystemColor.info;
/**
 * Crawler example that downloads one page, parses it with jsoup and prints
 * the title, the author line, the introduction and the chapter links.
 */
public class getdemo2 {

    /**
     * Fetches the page, decodes it as GBK, and prints selected parts of the document:
     * the text of the first {@code h1}, the text of the first {@code #info p},
     * the text of the element with id {@code intro}, every {@code #list dl dd a}
     * element, and finally the number of such elements.
     *
     * <p>A section whose element is missing from the page is skipped instead of
     * aborting the run. I/O failures are reported with a stack trace on standard
     * error and are not rethrown.
     *
     * @throws IOException kept in the signature for backward compatibility with existing callers
     */
    public void testHttpClientA() throws IOException {
        String url = "http://www.biquge.com.tw/17_17380/";

        // try-with-resources closes the response and then the client even when
        // execute() fails, in which case there is no response object to close.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {

            HttpEntity entity = response.getEntity();
            if (entity == null) {
                System.err.println("No response body returned for " + url);
                return;
            }

            // The page must be decoded as GBK; UTF-8 produced garbled Chinese text.
            String content = EntityUtils.toString(entity, "GBK");
            Document doc = Jsoup.parse(content);

            // Title: text of the first <h1>.
            String h1 = firstText(doc.getElementsByTag("h1"));
            if (h1 != null) {
                System.out.println("题目:" + h1);
            }

            // Author: text of the first paragraph inside #info.
            String author = firstText(doc.select("#info p"));
            if (author != null) {
                System.out.println(author);
            }

            // Introduction: the lookup result is checked because the element may be absent.
            Element introElement = doc.getElementById("intro");
            if (introElement != null) {
                System.out.println("简介" + introElement.text());
            }

            // Chapter list: each matching anchor is printed as markup, then the count.
            System.out.println("章节目录");
            Elements hrefElements = doc.select("#list dl dd a");
            for (Element link : hrefElements) {
                System.out.println(link.toString());
            }
            System.out.println(hrefElements.size());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Returns the text of the first element in {@code elements}, or {@code null} when the selection is empty. */
    private static String firstText(Elements elements) {
        return elements.isEmpty() ? null : elements.get(0).text();
    }
}
出现问题及解决
页面中文乱码问题
解决:该页面采用 GBK 编码,解码响应内容时应使用 GBK 而非 UTF-8。
关联代码
System.out.println(EntityUtils.toString(entity, "GBK"));
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(content,"GBK"));
String content=EntityUtils.toString(entity, "GBK");