JAVA爬虫(一)

网络爬虫

网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动的抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁,自动索引,模拟程序或者蠕虫。

使用环境

采用环境为:
JDK1.8
IDEA和自带的Maven

导入pom.xml依赖

首先导入http访问

<dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <!-- HttpClient -->
        <!-- Apache HttpClient: used to issue the HTTP requests -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
        <!-- Logging: SLF4J binding for log4j 1.2 -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
            <!-- NOTE: test scope keeps this binding off the main classpath, so classes under src/main get no log output -->
            <scope>test</scope>
        </dependency>

    </dependencies>

还需要在 resources 目录下新建一个文件,用来存放 slf4j(log4j)的日志配置
在 resources 目录下创建 log4j.properties 文件, 并添加以下配置

1 在控制台显示日志
# Root logger: DEBUG level, output goes to appender A1.
log4j.rootLogger=DEBUG,A1
# Log level for everything under the cn.itcast package.
log4j.logger.cn.itcast = DEBUG
# A1 writes to the console using a pattern layout.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
# Pattern: timestamp [thread] [logger]-[level] message newline.
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n

创建简单的爬虫程序爬取HTML代码

在java目录下创建目录,创建程序命名
在这里插入图片描述

package cn.itcast.crawler.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import sun.net.www.http.HttpClient;

/**
 * Minimal crawler example: fetches a single page with an HTTP GET and prints
 * the returned HTML to standard output.
 */
public class CrawlerFirst {
    /**
     * Requests {@code http://www.itcast.cn} and prints the body when the
     * server answers with status 200.
     *
     * @param args unused
     * @throws Exception if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws Exception {
        // 2. "Type the address": build the GET request for the target URL.
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        // 1. "Open the browser": create the HttpClient.
        // 3. "Press enter": execute the request and obtain the response.
        // try-with-resources closes the response first and then the client,
        // even when reading the body throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // 4. Parse the response: only read the body on status 200.
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(content);
            }
        }
    }
}

运用Get来获取

这一步需要在pom.xml中注释掉 < scope > test</ scope >

		<dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
            <!-- Scope intentionally disabled so the logging binding is available to the main classes, not only to tests -->
            <!--<scope>test</scope>-->
        </dependency>
package cn.itcast.crawler.test;

import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Sends a plain HTTP GET request and prints the length of the response body.
 */
public class HttpGetTest {
    /**
     * Requests {@code http://www.itcast.cn}; on status 200 the body is read as
     * UTF-8 and its length is printed. I/O failures are reported on stderr.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the HttpGet object with the URL to visit.
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        // try-with-resources closes the response and the client in reverse
        // order. If execute() throws, no response exists and nothing is
        // dereferenced, unlike a finally block calling close() on a null reference.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

运用Get参数来获取

package cn.itcast.crawler.test;

import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URISyntaxException;

/**
 * Sends an HTTP GET request whose query string is built with
 * {@code URIBuilder}, then prints the length of the response body.
 */
public class HttpGetTest {
    /**
     * Requests {@code http://yun.itheima.com/search?keys=Java}; on status 200
     * the body is read as UTF-8 and its length is printed.
     *
     * @param args unused
     * @throws Exception if the URI cannot be built
     */
    public static void main(String[] args) throws Exception {
        // Target address: http://yun.itheima.com/search?keys=Java
        // Build the URI and attach the query parameter.
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        uriBuilder.setParameter("keys", "Java");

        // Create the HttpGet object from the assembled URI.
        HttpGet httpGet = new HttpGet(uriBuilder.build());

        // try-with-resources closes the response and the client in reverse
        // order and never touches a null response when execute() fails.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                // Print the length, as the other examples do, so the fetched body is actually used.
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

运用Post来获取

只需要把 HttpGet 对象改为 HttpPost 对象就可以了

package cn.itcast.crawler.test;

import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Sends an HTTP POST request without a body and prints the length of the
 * response. The class keeps the name {@code HttpGetTest} from the GET example
 * it was derived from; only the request type differs.
 */
public class HttpGetTest {
    /**
     * Posts to {@code http://www.itcast.cn}; on status 200 the body is read as
     * UTF-8 and its length is printed. I/O failures are reported on stderr.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the HttpPost object with the URL to visit.
        HttpPost httpPost = new HttpPost("http://www.itcast.cn");

        // try-with-resources closes the response and the client in reverse
        // order and never touches a null response when execute() fails.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

运用Post请求带参数

package cn.itcast.crawler.test;

import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;

/**
 * Sends an HTTP POST request carrying URL-encoded form parameters and prints
 * the length of the response body.
 */
public class HttpPostTest {
    /**
     * Posts the form field {@code keys=Java} to
     * {@code http://yun.itheima.com/search}; on status 200 the body is read as
     * UTF-8 and its length is printed.
     *
     * @param args unused
     * @throws Exception if the form entity cannot be encoded
     */
    public static void main(String[] args) throws Exception {
        // Equivalent GET address: http://yun.itheima.com/search?keys=Java
        // Create the HttpPost object with the URL to visit.
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");

        // Collect the form parameters. The field is "keys", matching the
        // query parameter used by the GET variant of this request.
        ArrayList<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("keys", "Java"));

        // Wrap the parameters in a form entity (second argument is the
        // encoding) and attach it to the POST request.
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf8");
        httpPost.setEntity(formEntity);

        // try-with-resources closes the response and the client in reverse
        // order and never touches a null response when execute() fails.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                // Print the length, as the other examples do, so the fetched body is actually used.
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

使用连接池来调取HttpClient

package cn.itcast.crawler.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import sun.net.www.http.HttpClient;

import java.io.IOException;

/**
 * Demonstrates issuing a GET request through a client backed by a shared
 * {@code PoolingHttpClientConnectionManager}.
 */
public class HttpClientPool {
    /**
     * Configures the connection manager and runs one GET request through it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
        // Maximum number of connections overall.
        pool.setMaxTotal(100);
        // Maximum number of connections per host.
        pool.setDefaultMaxPerRoute(10);

        doGet(pool);
    }

    /**
     * Builds a client bound to the given connection manager, fetches
     * {@code http://www.itcast.cn} and prints the body on status 200.
     *
     * @param cm connection manager the client draws its connections from
     */
    private static void doGet(PoolingHttpClientConnectionManager cm) {
        HttpGet request = new HttpGet("http://www.itcast.cn");
        // The client is built on top of the shared connection manager
        // instead of owning its own connections.
        CloseableHttpClient client = HttpClients.custom().setConnectionManager(cm).build();

        CloseableHttpResponse reply = null;
        try {
            reply = client.execute(request);
            int status = reply.getStatusLine().getStatusCode();
            if (status != 200) {
                return;
            }
            System.out.println(EntityUtils.toString(reply.getEntity(), "utf8"));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Only the response is closed; per the original author's note the
            // client itself is left open because the pool manages it.
            release(reply);
        }
    }

    /** Closes the response if one was obtained, reporting close failures on stderr. */
    private static void release(CloseableHttpResponse reply) {
        if (reply == null) {
            return;
        }
        try {
            reply.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

请求参数的设置

就是在请求中设置一些访问的格式和时间

package cn.itcast.crawler.test;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import sun.misc.Request;

import java.io.IOException;

/**
 * Sends an HTTP GET request with explicit timeout settings and prints the
 * length of the response body.
 */
public class HttpConfigTest {
    /**
     * Requests {@code http://www.itcast.cn} using a custom
     * {@code RequestConfig}; on status 200 the body is read as UTF-8 and its
     * length is printed. I/O failures, including timeouts, are reported on
     * stderr.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the HttpGet object with the URL to visit.
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        // Request settings; all values are in milliseconds.
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(1000)            // max time to establish the connection
                .setConnectionRequestTimeout(500)   // max time to obtain a connection
                .setSocketTimeout(10 * 1000)        // max time for data transfer
                .build();
        // Apply the settings to this request.
        httpGet.setConfig(config);

        // try-with-resources closes the response and the client in reverse
        // order. A timeout makes execute() throw before any response exists,
        // so nothing null is ever closed.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

视频讲解

视频链接,共同学习进步

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值