Java爬虫

一、HttpURLConnection

使用JDK自带的API获取数据:

package cool.tdl;

import org.junit.Test;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;


/**
 * Demonstrates fetching a web page with the JDK's built-in {@link HttpURLConnection},
 * once with a GET request and once with a POST request.
 */
public class JDKAPITest {
    /** Page requested by both tests. */
    private static final String PAGE_URL = "https://www.cnblogs.com/zhangguangxiang/p/12007924.html#127284060";
    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36";
    /** Timeout, in milliseconds, applied to both connecting and reading. */
    private static final int TIMEOUT_MS = 30000;
    /** Charset used to encode the POST body and decode the response. */
    private static final String CHARSET = "UTF-8";

    /**
     * Sends a GET request and prints the response body.
     *
     * @throws Exception if the connection cannot be opened or the response cannot be read
     */
    @Test
    public void TestGet() throws Exception {
        HttpURLConnection connection = openConnection("GET");
        try {
            System.out.println(readBody(connection));
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Sends a POST request with a small form body and prints the response body.
     *
     * @throws Exception if the connection cannot be opened, written to, or read
     */
    @Test
    public void testPost() throws Exception {
        HttpURLConnection connection = openConnection("POST");
        try {
            // Allow writing a request body to the connection.
            connection.setDoOutput(true);
            // Closing the stream flushes the body before the response is requested.
            try (OutputStream outputStream = connection.getOutputStream()) {
                outputStream.write("username=tdl".getBytes(CHARSET));
            }
            System.out.println(readBody(connection));
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Opens a connection to {@link #PAGE_URL} and applies the common request settings.
     *
     * @param method HTTP method name, e.g. {@code "GET"} or {@code "POST"}
     * @return the configured, not yet connected, connection
     * @throws IOException if the connection cannot be created or the method is rejected
     */
    private static HttpURLConnection openConnection(String method) throws java.io.IOException {
        URL url = new URL(PAGE_URL);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod(method);
        connection.setRequestProperty("User-Agent", USER_AGENT);
        connection.setConnectTimeout(TIMEOUT_MS);
        // Without a read timeout a stalled server would block the test forever.
        connection.setReadTimeout(TIMEOUT_MS);
        return connection;
    }

    /**
     * Reads the whole response body as text, terminating every line with {@code "\n"}.
     *
     * @param connection connection whose response is read
     * @return the response body
     * @throws IOException if the response cannot be read
     */
    private static String readBody(HttpURLConnection connection) throws java.io.IOException {
        StringBuilder html = new StringBuilder();
        // try-with-resources closes the reader (and the wrapped stream) even when reading fails.
        try (InputStream inputStream = connection.getInputStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, CHARSET))) {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append("\n");
            }
        }
        return html.toString();
    }
}

二、HttpClient

使用这个工具进行网络爬取会比使用JDK自带的API爬取方便得多:

1、简单案例

package cool.tdl;

import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;

import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.junit.Test;


import java.util.ArrayList;
import java.util.List;


/**
 * Demonstrates simple GET and POST requests with Apache HttpClient.
 */
public class Httpclient {
    /** Page requested by both tests. */
    private static final String PAGE_URL = "https://blog.csdn.net/weixin_45688486/article/details/112691671#gethttpGet_4";
    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36";

    /**
     * Sends a GET request and prints the body when the server answers 200.
     *
     * @throws Exception if the request fails or the response cannot be read
     */
    @Test
    public void testGet() throws Exception {
        // Build the GET request and set its headers.
        HttpGet httpGet = new HttpGet(PAGE_URL);
        httpGet.setHeader("User-Agent", USER_AGENT);

        // Resources are closed in reverse order: the response first, then the client.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            printBodyIfOk(response);
        }
    }

    /**
     * Sends a POST request with one form parameter and prints the body when the server answers 200.
     *
     * @throws Exception if the request fails or the response cannot be read
     */
    @Test
    public void testPost() throws Exception {
        // Build the POST request and set its headers.
        HttpPost httpPost = new HttpPost(PAGE_URL);
        httpPost.setHeader("User-Agent", USER_AGENT);

        // Form parameters sent as a URL-encoded request body.
        List<NameValuePair> list = new ArrayList<NameValuePair>();
        list.add(new BasicNameValuePair("tdl", "java"));
        httpPost.setEntity(new UrlEncodedFormEntity(list, "UTF-8"));

        // Resources are closed in reverse order: the response first, then the client.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            printBodyIfOk(response);
        }
    }

    /**
     * Prints the response body decoded as UTF-8 when the status code is 200; otherwise prints nothing.
     *
     * @param response response to inspect
     * @throws Exception if the body cannot be read
     */
    private static void printBodyIfOk(CloseableHttpResponse response) throws Exception {
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(html);
        }
    }
}

2、HttpClient连接池

@Test
    public void testPool() throws Exception{
        // 1. Create the pooling connection manager.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        try {
            // 2. Configure the pool limits.
            cm.setMaxTotal(200);// maximum number of connections in the whole pool
            cm.setDefaultMaxPerRoute(20);// maximum concurrent connections per route (host)
            // Each call builds its own HttpClient instance on top of the same pool.
            doGet(cm);
            doGet(cm);
        } finally {
            // Release the pooled connections once both requests are done.
            cm.close();
        }
    }

    /**
     * Fetches a fixed page through a client backed by the given connection pool and prints
     * the body when the server answers 200.
     *
     * @param cm connection pool the client is built on; it is left open for later calls
     * @throws Exception if the request fails or the response cannot be read
     */
    private void doGet(PoolingHttpClientConnectionManager cm)throws Exception {
        // 3. Build an HttpClient on top of the pool. The client is deliberately not closed
        //    here because the pool is reused by the next call.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        // 4. Create the GET request.
        HttpGet httpGet = new HttpGet("https://blog.csdn.net/weixin_45688486/article/details/112691671#gethttpGet_4");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36");
        // 5. Execute the request; try-with-resources closes the response even if reading the body fails.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // 6. Read the response body.
            if (response.getStatusLine().getStatusCode() == 200) {
                String html = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(html);
            }
        }
    }

3、创建请求配置对象

免费代理服务器平台

 @Test
    public void testConfig() throws Exception{
        // 0. Build the request configuration.
        RequestConfig requestConfig =  RequestConfig.custom()
                .setSocketTimeout(10000)// socket (read) timeout: max wait for response data
                .setConnectTimeout(10000)// timeout for establishing the connection
                .setConnectionRequestTimeout(10000)// timeout for obtaining a connection from the connection manager
                .setProxy(new HttpHost("61.133.87.228",55443))// route requests through this proxy
                .build();
        // 2. Create the GET request and set its headers.
        HttpGet httpGet = new HttpGet("http://www.itcast.cn/");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36");
        // 1 + 3. Create a client that applies the configuration by default and execute the request.
        //        Resources are closed in reverse order: the response first, then the client.
        try (CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // 4. Read the response body.
            if (response.getStatusLine().getStatusCode() == 200) {
                String html = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(html);
            }
        }
    }

4、HttpClient封装

package cool.tdl.utils;

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Helper for downloading pages through a shared, pooled Apache HttpClient.
 *
 * <p>Every request uses the same timeouts and a User-Agent picked at random from a small
 * built-in list.
 */
public abstract class HttpUtils {
    /** Connection pool shared by all requests. */
    private static final PoolingHttpClientConnectionManager cm;
    /** Timeouts applied to every request. */
    private static final RequestConfig config;
    /** User-Agent values to choose from. */
    private static final List<String> userAgentList;
    /** Shared random source for picking a User-Agent. */
    private static final Random RANDOM = new Random();
    /** Client built once on top of the pool and reused by every call. */
    private static final CloseableHttpClient HTTP_CLIENT;

    static {
        cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(200);// maximum number of connections in the whole pool
        cm.setDefaultMaxPerRoute(20);// maximum concurrent connections per route (host)
        config = RequestConfig.custom()
                .setSocketTimeout(10000)// socket (read) timeout: max wait for response data
                .setConnectTimeout(10000)// timeout for establishing the connection
                .setConnectionRequestTimeout(10000)// timeout for obtaining a connection from the pool
                // A proxy can be enabled here with .setProxy(new HttpHost(host, port)).
                .build();
        userAgentList = new ArrayList<String>();
        userAgentList.add("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36");
        userAgentList.add("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0");
        HTTP_CLIENT = HttpClients.custom().setConnectionManager(cm).build();
    }

    /**
     * Downloads the given URL and returns its body decoded as UTF-8.
     *
     * @param url address to fetch
     * @return the body when the server answers 200 (an empty string if the response has no
     *         entity); {@code null} when the status is not 200 or an I/O error occurs
     */
    public static String getHtml(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(config);
        httpGet.setHeader("User-Agent", userAgentList.get(RANDOM.nextInt(userAgentList.size())));
        // try-with-resources only closes the response if execute() actually returned one,
        // so a failed request no longer triggers a NullPointerException during cleanup.
        try (CloseableHttpResponse response = HTTP_CLIENT.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() != 200) {
                return null;
            }
            if (response.getEntity() == null) {
                return "";
            }
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Small manual check: downloads one page and prints the result.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String html = getHtml("https://proxy.mimvp.com/freeopen?proxy=in_hp");
        System.out.println(html);
    }
}

三、JSoup

jsoup中文帮助文档

1、入门案例

package cool.tdl;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;

/**
 * Introductory jsoup example: downloads a page and prints the text of its title element.
 */
public class JsoupTest {
    /**
     * Fetches the page, looks up the first {@code <title>} element and prints its text.
     *
     * @throws Exception if the page cannot be fetched, or if it contains no title element
     */
    @Test
    public void testGetDocument() throws Exception{
        Document doc = Jsoup.connect("https://proxy.mimvp.com/freeopen?proxy=in_hp").get();
        Element title = doc.getElementsByTag("title").first();
        // first() yields null when nothing matched; fail with a clear message instead of a bare NPE.
        if (title == null) {
            throw new IllegalStateException("No <title> element found in the fetched document");
        }
        System.out.println(title.text());
    }
}

2、获取元素

1.getElementById()//根据id获取
2.getElementsByTag()//根据标签获取
3.getElementsByClass()//根据类选择器获取
4.getElementsByAttribute()//根据属性获取

3、元素操作

在这里插入图片描述

4、选择器

在这里插入图片描述
在这里插入图片描述

5、选择器组合

在这里插入图片描述
在这里插入图片描述
正则表达式教程

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值