一、HttpURLConnection
使用JDK自带的API获取数据:
package cool.tdl;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Demonstrates fetching a web page with the JDK's built-in {@link HttpURLConnection}:
 * once with a GET request and once with a POST request that carries a small form body.
 */
public class JDKAPITest {
    /** Page requested by both tests. */
    private static final String PAGE_URL = "https://www.cnblogs.com/zhangguangxiang/p/12007924.html#127284060";
    /** Browser-like User-Agent header sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36";
    /** Timeout, in milliseconds, applied to both connecting and reading. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Sends a GET request and prints the response body.
     *
     * @throws Exception if the request cannot be sent or the response cannot be read
     */
    @Test
    public void TestGet() throws Exception {
        HttpURLConnection connection = open("GET");
        try {
            System.out.println(readBody(connection));
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Sends a POST request with the body {@code username=tdl} and prints the response body.
     *
     * @throws Exception if the request cannot be sent or the response cannot be read
     */
    @Test
    public void testPost() throws Exception {
        HttpURLConnection connection = open("POST");
        try {
            // Must be enabled before the request body can be written.
            connection.setDoOutput(true);
            try (OutputStream outputStream = connection.getOutputStream()) {
                // Encode explicitly instead of relying on the platform default charset.
                outputStream.write("username=tdl".getBytes("UTF-8"));
            }
            System.out.println(readBody(connection));
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Creates a not-yet-connected connection to {@link #PAGE_URL} with the shared settings.
     *
     * @param method HTTP method name, e.g. {@code "GET"} or {@code "POST"}
     * @return the configured connection
     * @throws Exception if the URL is malformed or the connection object cannot be created
     */
    private static HttpURLConnection open(String method) throws Exception {
        URL url = new URL(PAGE_URL);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod(method);
        connection.setRequestProperty("User-Agent", USER_AGENT);
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        // Without a read timeout a stalled server would block the test indefinitely.
        connection.setReadTimeout(TIMEOUT_MILLIS);
        return connection;
    }

    /**
     * Reads the whole response body line by line, terminating every line with {@code "\n"}.
     *
     * @param connection the connection whose response is read
     * @return the response body decoded as UTF-8
     * @throws Exception if the response stream cannot be opened or read
     */
    private static String readBody(HttpURLConnection connection) throws Exception {
        StringBuilder html = new StringBuilder();
        // try-with-resources closes the reader and the stream even when reading fails.
        try (InputStream inputStream = connection.getInputStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append("\n");
            }
        }
        return html.toString();
    }
}
二、HttpClient
使用这个工具进行网络爬取,会比使用JDK自带的API方便得多:
1、简单案例
package cool.tdl;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.junit.Test;
import java.util.ArrayList;
import java.util.List;
/**
 * Minimal Apache HttpClient examples: one GET request and one form-encoded POST request.
 */
public class Httpclient {
    /**
     * Sends a GET request and prints the response body when the status code is 200.
     *
     * @throws Exception if the request fails or the response cannot be read
     */
    @Test
    public void testGet() throws Exception {
        // Build the GET request and give it a browser-like User-Agent.
        HttpGet httpGet = new HttpGet("https://blog.csdn.net/weixin_45688486/article/details/112691671#gethttpGet_4");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36");
        // try-with-resources closes the response first and the client second,
        // and does so even when executing the request or reading the body throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String html = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(html);
            }
        }
    }

    /**
     * Sends a POST request with the form parameter {@code tdl=java} and prints the
     * response body when the status code is 200.
     *
     * @throws Exception if the request fails or the response cannot be read
     */
    @Test
    public void testPost() throws Exception {
        HttpPost httpPost = new HttpPost("https://blog.csdn.net/weixin_45688486/article/details/112691671#gethttpGet_4");
        httpPost.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36");
        // Form parameters sent as the URL-encoded request body.
        List<NameValuePair> list = new ArrayList<NameValuePair>();
        list.add(new BasicNameValuePair("tdl", "java"));
        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(list, "UTF-8");
        httpPost.setEntity(entity);
        // Resources are released in reverse order: response, then client.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String html = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(html);
            }
        }
    }
}
2、HttpClient连接池
/**
 * Runs two GET requests that share one pooling connection manager.
 *
 * @throws Exception if either request fails
 */
@Test
public void testPool() throws Exception {
    // 1. Create the pooling connection manager.
    PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
    try {
        // 2. Configure the pool limits.
        cm.setMaxTotal(200);           // maximum number of connections in the whole pool
        cm.setDefaultMaxPerRoute(20);  // maximum number of connections per route (target host)
        // Each call builds its own HttpClient instance on top of the same pool.
        doGet(cm);
        doGet(cm);
    } finally {
        // Release the pooled connections once both requests are done.
        cm.shutdown();
    }
}
/**
 * Sends one GET request through a client backed by the given connection manager and
 * prints the response body when the status code is 200.
 *
 * @param cm the shared pooling connection manager the client is built on
 * @throws Exception if the request fails or the response cannot be read
 */
private void doGet(PoolingHttpClientConnectionManager cm) throws Exception {
    // 3. Build an HttpClient that takes its connections from the pool.
    // The client is deliberately not closed here because the pool is shared between calls.
    // NOTE(review): per the HttpClientBuilder docs, closing a client also shuts down its
    // connection manager unless setConnectionManagerShared(true) is used - confirm before
    // adding a close() call.
    CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    // 4. Create the GET request.
    HttpGet httpGet = new HttpGet("https://blog.csdn.net/weixin_45688486/article/details/112691671#gethttpGet_4");
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36");
    // 5. Execute the request; try-with-resources closes the response even if reading
    // the body throws, so the pooled connection is always released.
    try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
        // 6. Read the response body.
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(html);
        }
    }
}
3、创建请求配置对象
/**
 * Sends a GET request through a client whose default request configuration sets
 * timeouts and an HTTP proxy, and prints the response body when the status code is 200.
 *
 * @throws Exception if the request fails or the response cannot be read
 */
@Test
public void testConfig() throws Exception {
    // 0. Build the request configuration.
    RequestConfig requestConfig = RequestConfig.custom()
            .setSocketTimeout(10000)            // maximum wait for response data on the socket
            .setConnectTimeout(10000)           // maximum time to establish the connection
            .setConnectionRequestTimeout(10000) // maximum wait for a connection from the connection manager
            .setProxy(new HttpHost("61.133.87.228", 55443))
            .build();
    // 2. Create the GET request.
    HttpGet httpGet = new HttpGet("http://www.itcast.cn/");
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36");
    // 1 + 3. Create a client that applies the configuration to every request and execute
    // the request. try-with-resources closes the response first and the client second,
    // even when an exception is thrown.
    try (CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        // 4. Read the response body.
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(html);
        }
    }
}
4、HttpClient封装
package cool.tdl.utils;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
/**
 * Static helper for downloading pages through a shared Apache HttpClient connection pool.
 * Every request gets the shared timeouts and a User-Agent picked at random from a fixed list.
 */
public abstract class HttpUtils {
    /** Connection pool shared by all requests. */
    private static final PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
    /** Request configuration (timeouts) applied to every request. */
    private static final RequestConfig config;
    /** User-Agent values to pick from. */
    private static final List<String> userAgentList = new ArrayList<String>();
    /** Single generator reused across calls instead of allocating one per request. */
    private static final Random RANDOM = new Random();

    static {
        cm.setMaxTotal(200);           // maximum number of connections in the whole pool
        cm.setDefaultMaxPerRoute(20);  // maximum number of connections per route (target host)
        // A proxy can be configured here with setProxy(new HttpHost(host, port)).
        config = RequestConfig.custom()
                .setSocketTimeout(10000)            // maximum wait for response data on the socket
                .setConnectTimeout(10000)           // maximum time to establish the connection
                .setConnectionRequestTimeout(10000) // maximum wait for a connection from the pool
                .build();
        userAgentList.add("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36");
        userAgentList.add("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0");
    }

    /**
     * Downloads the given URL with a GET request.
     *
     * @param url the address to fetch
     * @return the response body decoded as UTF-8 when the status code is 200 (an empty
     *         string if the response has no entity); {@code null} when the status code
     *         is not 200 or an {@link IOException} occurs
     */
    public static String getHtml(String url) {
        // The client is not closed because the connection pool behind it is shared.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(config);
        httpGet.setHeader("User-Agent", userAgentList.get(RANDOM.nextInt(userAgentList.size())));
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String html = "";
                if (response.getEntity() != null) {
                    html = EntityUtils.toString(response.getEntity(), "UTF-8");
                }
                return html;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // response is still null when execute() itself failed; closing it
            // unconditionally would replace the handled IOException with a NullPointerException.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    /**
     * Fetches a sample page and prints the result.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String html = getHtml("https://proxy.mimvp.com/freeopen?proxy=in_hp");
        System.out.println(html);
    }
}
三、JSoup
1、入门案例
package cool.tdl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;
/**
 * Introductory Jsoup example: download a page and print the text of its title element.
 */
public class JsoupTest {
    /**
     * Fetches the page, takes the first {@code <title>} element and prints its text.
     *
     * @throws Exception if the page cannot be fetched
     */
    @Test
    public void testGetDocument() throws Exception {
        // Download and parse the page in one step.
        Document page = Jsoup.connect("https://proxy.mimvp.com/freeopen?proxy=in_hp").get();
        // Look up the first <title> element and extract its text.
        Element firstTitle = page.getElementsByTag("title").first();
        System.out.println(firstTitle.text());
    }
}
2、获取元素
1.getElementById()//根据id获取(返回单个Element)
2.getElementsByTag()//根据标签获取
3.getElementsByClass()//根据类选择器获取
4.getElementsByAttribute()//根据属性获取
3、元素操作
4、选择器