JAVA爬虫系列

目录

准备工作

yml

1.入门程序(获取到静态页面)

2.HttpClient---Get

2.1 HttpClient--Get 修改成连接池(直接看这个)

3.HttpClient---Get带参数

3.1 修改成连接池

4.HttpClient---Post

4.1 修改成连接池

5.HttpClient---Post带参数

6.HttpClient-连接池

7.设置请求信息

8.jsoup介绍.

9.jsoup解析url

10.jsoup解析字符串

11.jsoup解析文件

12.所有dom方式获取元素

13.元素中获取数据

14.选择器方式获取元素


准备工作

导入依赖

 <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>

yml

logging:
  level:
    root: info
    com.lrm: debug

1.入门程序(获取到静态页面)

package com.itheima.reggie.utils;


import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Introductory crawler example: downloads one static page with Apache
 * HttpClient and prints its HTML.
 *
 * @Author lpc
 **/
public class CrawlerFirst {
    /**
     * Issues a GET request for the itcast home page and prints the response
     * body when the status code is 200.
     *
     * @param args unused
     * @throws Exception if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws Exception {

        // The request object plays the role of "typing the address".
        HttpGet httpGet = new HttpGet("https://www.itcast.cn/");

        // try-with-resources closes the response and then the client on every
        // path, including when execute() or the body read throws. The original
        // never closed either of them.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only a 200 response is parsed; any other status prints nothing.
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                // Decode the page body, using "utf8" as the charset argument.
                String content = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(content);
            }
        }
    }
}

2.HttpClient---Get

package com.itheima.reggie.utils;


import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * GET example: fetches one page with a default HttpClient and prints the
 * length of the returned HTML.
 *
 * @Author lpc
 * @Date 2024 03 12 00 23
 **/
public class CrawlerFirst {
    /**
     * Sends a GET request to the itcast home page and prints the body length
     * when the status code is 200.
     *
     * @param args unused
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *         executing the request, reading the body, or closing resources
     */
    public static void main(String[] args) {

        HttpGet httpGet = new HttpGet("https://www.itcast.cn/");

        // try-with-resources replaces the hand-written finally block. The old
        // code called response.close() even when execute() had thrown and
        // response was still null, so a NullPointerException hid the real
        // failure; it also skipped httpClient.close() if response.close() threw.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only a 200 response is parsed; any other status prints nothing.
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            // Same contract as before: I/O problems surface as unchecked exceptions.
            throw new RuntimeException(e);
        }
    }
}

2.1 HttpClient--Get 修改成连接池(直接看这个)

如果每次请求都要创建HttpClient,会有频繁创建和销毁的问题,可以使用连接池来解决这个问题。
测试以下代码,并断点查看每次获取的HttpClient都是不一样的。

package org.example;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * GET example that routes requests through a shared
 * {@link PoolingHttpClientConnectionManager}.
 *
 * @Author lpc
 * @Date 2024 03 14 09 38
 **/
public class Test {

    /**
     * Configures the connection pool limits and performs a single GET.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
        // Upper bound on connections across all routes.
        pool.setMaxTotal(100);
        // Upper bound on connections per host.
        pool.setDefaultMaxPerRoute(10);
        doGet(pool);
    }

    /**
     * Builds a client on top of the given connection manager, requests the
     * itcast home page and prints the body length for a 200 response.
     *
     * @param cm connection manager the client is built on
     * @throws RuntimeException wrapping any {@link IOException} from the
     *         request, the body read, or closing the response
     */
    public static void doGet(PoolingHttpClientConnectionManager cm){
        // The client is built around the shared manager rather than owning
        // a connection manager of its own.
        CloseableHttpClient client = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet request = new HttpGet("http://www.itcast.cn");
        CloseableHttpResponse response = null;
        try {
            response = client.execute(request);
            int status = response.getStatusLine().getStatusCode();
            if (status == 200) {
                String body = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(body.length());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            // Only the response is closed; the original note says the client
            // must stay open because the pool manages it.
            closeResponse(response);
        }
    }

    /** Closes the response if one was obtained, rethrowing I/O failures unchecked. */
    private static void closeResponse(CloseableHttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}

3.HttpClient---Get带参数

package org.example;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URISyntaxException;

/**
 * GET example whose query string is assembled with {@code URIBuilder}.
 *
 * @Author lpc
 * @Date 2024 03 13 20 44
 **/
public class Test2 {
    /**
     * Requests http://yun.itheima.com/search?keys=Java and prints the body
     * when the status code is 200.
     *
     * @param args unused
     * @throws Exception if the URI cannot be built; I/O failures are rethrown
     *         as {@link RuntimeException}
     */
    public static void main(String[] args) throws Exception {

        // Build the URI first so that a bad URI cannot leak an already-open client.
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        // Adds the query parameter; further parameters can be set the same way.
        uriBuilder.setParameter("keys","Java");

        HttpGet httpGet = new HttpGet(uriBuilder.build());
        System.out.println("发起请求的信息"+httpGet);

        // try-with-resources replaces the manual finally block, which
        // dereferenced a null response when execute() threw and skipped
        // httpClient.close() when response.close() failed.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only a 200 response is parsed; any other status prints nothing.
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String s = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(s);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}

3.1 修改成连接池

4.HttpClient---Post

package org.example;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * POST example without a request body.
 *
 * @Author lpc
 * @Date 2024 03 13 20 59
 **/
public class Post {
    /**
     * Sends an empty POST to the itcast home page and prints the body when
     * the status code is 200.
     *
     * @param args unused
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *         executing the request, reading the body, or closing resources
     */
    public static void main(String[] args) {

        // Identical to the GET example except for the request type.
        HttpPost httpPost = new HttpPost("https://www.itcast.cn/");

        // try-with-resources closes response and client in the right order on
        // every path. The previous finally block threw a NullPointerException
        // when execute() failed, because response was still null, and it left
        // the client open whenever response.close() threw.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            // Only a 200 response is parsed; any other status prints nothing.
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String s = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(s);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}

4.1 修改成连接池

package org.example;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * POST example that routes requests through a shared
 * {@link PoolingHttpClientConnectionManager}.
 *
 * @Author lpc
 * @Date 2024 03 14 10 02
 **/
public class Postl {
    /**
     * Configures the connection pool limits and performs a single POST.
     *
     * @param args unused
     */
    public static void main(String[] args){

        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // Upper bound on connections across all routes.
        cm.setMaxTotal(100);
        // Upper bound on connections per host.
        cm.setDefaultMaxPerRoute(10);
        doPost(cm);
    }

    /**
     * Builds a client on top of the given connection manager, posts to the
     * search page and prints the body length for a 200 response.
     *
     * @param cm connection manager the client is built on
     * @throws RuntimeException wrapping any {@link IOException} from the
     *         request, the body read, or closing the response
     */
    private static void doPost(PoolingHttpClientConnectionManager cm) {
        // The client is built around the shared manager. It is deliberately
        // not closed here; the original note says the pool manages it.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");

        // Only the response is scoped to this call; closing it is what the
        // original finally block did, minus the explicit null check.
        try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                // Pass "utf8" like every other example in this series; the
                // one-argument overload was the only call that omitted it.
                String s = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(s.length());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}

5.HttpClient---Post带参数

package org.example;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

/**
 * POST example that submits URL-encoded form parameters.
 *
 * @Author lpc
 * @Date 2024 03 13 20 59
 **/
public class Post {
    /**
     * Posts the form field {@code keys=Java} to the search page and prints
     * the body when the status code is 200.
     *
     * @param args unused
     * @throws Exception if the form entity cannot be encoded; I/O failures
     *         are rethrown as {@link RuntimeException}
     */
    public static void main(String[] args) throws Exception {

        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");

        // Form fields to submit; equivalent to search?keys=Java in the GET example.
        List<NameValuePair> params = new ArrayList<>();
        params.add(new BasicNameValuePair("keys","Java"));

        // First argument is the form data, second is the encoding used for it.
        UrlEncodedFormEntity urlEncodedFormEntity = new UrlEncodedFormEntity(params,"utf8");
        httpPost.setEntity(urlEncodedFormEntity);

        // The client is opened only after the request is fully built, and
        // try-with-resources closes response and client on every path. The old
        // finally block threw a NullPointerException when execute() failed,
        // because response was still null.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            // Only a 200 response is parsed; any other status prints nothing.
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String s = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(s);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}

6.HttpClient-连接池

如果每次请求都要创建HttpClient,会有频繁创建和销毁的问题,可以使用连接池来解决这个问题。
测试以下代码,并断点查看每次获取的HttpClient都是不一样的。

package org.example;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Connection-pool example: one GET issued through a client built on a
 * {@link PoolingHttpClientConnectionManager}.
 *
 * @Author lpc
 * @Date 2024 03 14 09 38
 **/
public class Test {

    /**
     * Creates the pool, applies its limits and runs one request.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager();
        manager.setMaxTotal(100);            // total connections allowed
        manager.setDefaultMaxPerRoute(10);   // connections allowed per host
        doGet(manager);
    }

    /**
     * Requests the itcast home page via a client built on {@code cm} and
     * prints the body length when the status code is 200.
     *
     * @param cm connection manager the client is built on
     * @throws RuntimeException wrapping any {@link IOException} from the
     *         request, the body read, or closing the response
     */
    public static void doGet(PoolingHttpClientConnectionManager cm){
        CloseableHttpClient pooledClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet pageRequest = new HttpGet("http://www.itcast.cn");
        CloseableHttpResponse pageResponse = null;
        try {
            pageResponse = pooledClient.execute(pageRequest);
            if (pageResponse.getStatusLine().getStatusCode() != 200) {
                // Nothing to print for non-200 answers; finally still runs.
                return;
            }
            String html = EntityUtils.toString(pageResponse.getEntity(), "utf8");
            System.out.println(html.length());
        } catch (IOException ioFailure) {
            throw new RuntimeException(ioFailure);
        } finally {
            // The response is closed when one exists. The client is left open
            // on purpose: the original note says the pool manages it.
            if (pageResponse != null) {
                try {
                    pageResponse.close();
                } catch (IOException closeFailure) {
                    throw new RuntimeException(closeFailure);
                }
            }
        }
    }
}

7.设置请求信息
 

package org.example;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * @Author lpc
 * @Date 2024 03 14 09 38
 **/
public class Test {

    public static void main(String[] args) {
        //创建连接池
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        //设置最大连接数
        cm.setMaxTotal(100);
        //设置每个主机的最大连接数
        cm.setDefaultMaxPerRoute(10);
        //使用连接池管理器发起请求
        doGet(cm);
    }

    public static void doGet(PoolingHttpClientConnectionManager cm){
        //不是每次创建新的httpClient,而是从连接池中获取HttpClient对象
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        //配置请求信息
        RequestConfig config=RequestConfig.custom().setConnectTimeout(1000) //创建连接的最长时间,单位是毫秒
                .setConnectionRequestTimeout(500)//设置获取连接的最长时间
                .setSocketTimeout(10*1000)//设置数据传输的最长时间
                .build();
        //给请求设置请求信息
        httpGet.setConfig(config);
        CloseableHttpResponse response=null;
        try {
           response = httpClient.execute(httpGet);
           if (response.getStatusLine().getStatusCode()==200){
               String content = EntityUtils.toString(response.getEntity(), "utf8");
               System.out.println(content.length());
           }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }finally {
           if (response!=null){
               try {
                   response.close();
               } catch (IOException e) {
                   throw new RuntimeException(e);
               }
               //不能关闭,由连接池管理
             //  httpClient.close();
           }
        }

    }
}

8.jsoup介绍.

jsoup是一款Java 的 HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

jsoup 的主要功能如下:

1.从一个 URL,文件或字符串中解析HTML;

2.使用DOM或CSS选择器来查找、取出数据;

3.可操作HTML元素、属性、文本;

依赖

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>

<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
    <scope>test</scope>
</dependency>


<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>


<!-- lang3 -->
                 <dependency>
                     <groupId>org.apache.commons</groupId>
                     <artifactId>commons-lang3</artifactId>
                     <version>3.8.1</version>
                 </dependency>

9.jsoup解析url

package jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;

import java.net.MalformedURLException;
import java.net.URL;

/**
 * jsoup example: parse a document directly from a URL.
 *
 * @Author lpc
 * @Date 2024 03 14 10 44
 **/
public class jsoupTestFirst {

    /**
     * Fetches and parses the itcast home page, then prints the text of the
     * first {@code <title>} element.
     *
     * @throws Exception if the URL is malformed or the page cannot be fetched
     */
    @Test
    public void testJsoupUrl() throws Exception {
        URL pageUrl = new URL("http://www.itcast.cn");
        // Second argument of Jsoup.parse is the timeout passed with the URL.
        int timeout = 10*1000;
        Document page = Jsoup.parse(pageUrl, timeout);

        // First element with tag name "title".
        Element titleElement = page.getElementsByTag("title").first();
        String titleText = titleElement.text();
        System.out.println(titleText);
    }

}

10.jsoup解析字符串

package jsoup;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;

import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * jsoup example: parse a document from an in-memory string.
 *
 * @Author lpc
 * @Date 2024 03 14 10 44
 **/
public class jsoupTestFirst {

    /**
     * Reads an HTML file into a string, parses that string and prints the
     * text of the first {@code <title>} element.
     *
     * @throws Exception if the file cannot be read
     */
    @Test
    public void testString() throws Exception {
        // Load the whole file into a string with commons-io.
        File source = new File("D:\\file.html");
        String html = FileUtils.readFileToString(source, "utf8");

        // Parse the string, not the file.
        Document document = Jsoup.parse(html);

        Element titleElement = document.getElementsByTag("title").first();
        System.out.println(titleElement.text());
    }

}

11.jsoup解析文件

@Test
    public void testFile() throws Exception {
        // Parse straight from the file; "utf8" is the charset name handed to jsoup.
        File htmlFile = new File("D:\\file.html");
        Document document = Jsoup.parse(htmlFile, "utf8");

        // Print the text of the first <title> element.
        Element titleElement = document.getElementsByTag("title").first();
        System.out.println(titleElement.text());
    }

12.所有dom方式获取元素

 @Test
    public void testDom() throws Exception {
        // Parse the file to obtain the Document.
        File htmlFile = new File("D:\\file.html");
        Document document = Jsoup.parse(htmlFile, "utf8");

        // DOM-style lookups shown by this example (only the fourth is active):
        //   1. by id:             document.getElementById("popupMenu")
        //   2. by tag name:       document.getElementsByTag("span").first()
        //   3. by class name:     document.getElementsByClass("city_nav")
        //   4. by attribute name: document.getElementsByAttribute("abc")
        Elements matches = document.getElementsByAttribute("abc");

        String combinedText = matches.text();
        System.out.println(combinedText);
    }

13.元素中获取数据

 @Test
    public void testData() throws Exception {
        // Parse the file and pick the element whose id is "test".
        File htmlFile = new File("D:\\file.html");
        Document document = Jsoup.parse(htmlFile, "utf8");
        Element target = document.getElementById("test");
        System.out.println(target);

        // 1. the element's id
        System.out.println(target.id());

        // 2. the element's class name
        System.out.println(target.className());

        // 3. the value of a named attribute (here: "id")
        System.out.println(target.attr("id"));

        // 4. all attributes of the element
        Attributes allAttributes = target.attributes();
        System.out.println(allAttributes);

        // 5. the element's text content
        System.out.println(target.text());
    }

14.选择器方式获取元素

 @Test
    public void testSeletor() throws Exception {
        // Parse the file to obtain the Document.
        File htmlFile = new File("D:\\file.html");
        Document document = Jsoup.parse(htmlFile, "utf8");

        // Selector forms covered by this example (only the last one is active):
        //
        //   tagname       - find elements by tag, e.g. span
        //       document.select("span").forEach(System.out::println);
        //
        //   #id           - find an element by id, e.g. #city_bj
        //       Element test = document.select("#test").first();
        //       System.out.println(test);
        //
        //   .class        - find elements by class name, e.g. .class_a
        //       Element inner = document.select(".inner").first();
        //       System.out.println(inner);
        //
        //   [attribute]   - find elements that carry an attribute, e.g. [abc]
        //       Element first = document.select("[abc]").first();
        //       System.out.println(first.text());
        //
        //   [attr=value]  - find elements by attribute value, e.g. [class=s_name]
        document.select("[class=s_name]").forEach(match -> System.out.println(match));
    }

15.Selector选择器组合使用

@Test
    public void testSeletor2() throws Exception {
        // Parse the file to obtain the Document.
        Document parse = Jsoup.parse(new File("D:\\file.html"), "utf8");

        // el#id: element + id, e.g. h3#city_bj.
        // The '#' was missing, which made the selector look for a tag literally
        // named "h3city_id" instead of an h3 carrying that id.
        // NOTE(review): the id name is kept as written; confirm it exists in file.html.
        Element element = parse.select("h3#city_id").first();

        // el.class: element + class, e.g. li.class_a
        Element element1 = parse.select("li.class_a").first();

        // el[attr]: element + attribute name, e.g. span[abc]
        Element first = parse.select("span[abc]").first();
        System.out.println(first);

        // Any combination, e.g. span[abc].s_name
        Element first2 = parse.select("span[abc].s_name").first();
        System.out.println(first2);

        // ancestor child: descendants of an element, e.g. ".city_con li"
        // finds every li below "city_con".
        parse.select(".city_con li").forEach(System.out::println);

        // parent > child: direct children only, e.g. ".city_con > ul > li"
        // takes the ul elements directly under city_con, then the li elements
        // directly under those.
        Elements select = parse.select(".city_con > ul > li");

        // parent > *: every direct child of the parent.
        Elements select1 = parse.select(".city_con > ul > *");
    }

16.

  • 27
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值