一、原理
首先创建HttpClient对象;然后根据请求方式创建请求对象并指定url:GET请求创建HttpGet对象,POST请求创建HttpPost对象;最后调用HttpClient的execute方法发送请求,并从返回的响应中读取内容。
二、小例子
package com.xiang;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
/**
 * Minimal crawler example: fetches a single page with Apache HttpClient and
 * hands the downloaded HTML to a {@link BaiduParser} running on its own thread.
 */
public class Spider {

    /** Timeout in milliseconds used for pool lease, connect and socket read. */
    private static final int TIMEOUT_MILLIS = 6000;

    /** Address of the page to fetch; replace with the real target. */
    private static final String TARGET_URL = "http://www.baidu.com";

    /** Browser-like User-Agent header sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36";

    /**
     * Builds a client with timeouts configured, fetches {@link #TARGET_URL}
     * once and starts a parser thread for the result.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Without a socket timeout a server that stops sending data would
        // block the read forever, so all three timeouts are set.
        RequestConfig requestConfig = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.STANDARD)
                .setConnectionRequestTimeout(TIMEOUT_MILLIS)
                .setConnectTimeout(TIMEOUT_MILLIS)
                .setSocketTimeout(TIMEOUT_MILLIS)
                .build();

        // try-with-resources closes the client even when the fetch throws.
        try (CloseableHttpClient httpClient = HttpClients.custom()
                .setDefaultRequestConfig(requestConfig).build()) {
            // For sites with numbered pages, call fetchAndParse once per page
            // here. Pause between requests (the original note suggested about
            // 3.6 s) so the crawler is not banned by IP for going too fast.
            fetchAndParse(httpClient, TARGET_URL);
        } catch (IOException e) {
            // Request failures are handled inside fetchAndParse, so only a
            // failure to close the client can arrive here.
            System.err.println("Failed to close the HTTP client");
            e.printStackTrace();
        }
    }

    /**
     * Sends a GET request to {@code url} and starts a {@link BaiduParser}
     * thread with the response body. Failures are reported on standard error
     * instead of being propagated, so one bad page does not abort the crawl.
     *
     * @param httpClient client used to execute the request
     * @param url        address of the page to download
     */
    private static void fetchAndParse(CloseableHttpClient httpClient, String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.addHeader("User-Agent", USER_AGENT);

        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getEntity() == null) {
                System.err.println("No response body returned for " + url);
                return;
            }
            InputStream in = response.getEntity().getContent();
            String html = convertStreamToString(in);
            new Thread(new BaiduParser(html)).start();
        } catch (IOException e) {
            System.err.println("Request to " + url + " failed");
            e.printStackTrace();
        }
    }

    /**
     * Reads the whole stream as text and closes it.
     *
     * <p>The bytes are decoded as UTF-8 rather than the platform default
     * charset, so the result does not depend on the machine the crawler runs
     * on. NOTE(review): pages served in another encoding (e.g. GBK) would need
     * the charset taken from the response headers instead.
     *
     * @param in stream to read; closed before this method returns
     * @return the stream content with every line terminated by {@code '\n'};
     *         if reading fails part-way, the text read so far
     */
    private static String convertStreamToString(InputStream in) {
        StringBuilder sb = new StringBuilder();
        // Closing the reader also closes the wrapped input stream.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }
}
package com.xiang;
public class BaiduParser implements Runnable{
String html;
public BaiduParser(String html) {
this.html = html;
}
public void run() {
System.out.println(html);
//通过正则表达式或截取取得自己想要的内容
}
}