用Java爬取数据(url爬取文件)

Java爬取接口的数据

首先让大家看看爬取的结果:一共 46884 条数据

在这里插入图片描述

这是正在爬取文件的过程
在这里插入图片描述
这是收获
在这里插入图片描述
废话不多说直接上代码

因为要先有数据才能爬取文件,所以先来一段爬取数据的代码

pom文件添加的依赖包
只添加一下关键的包

   <!-- commons: declared once each; Maven warns when the same groupId:artifactId appears twice -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.5</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>
        <!-- MybatisPlus -->
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-boot-starter</artifactId>
            <version>3.1.1</version>
        </dependency>
        <!-- Gson -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.5</version>
        </dependency>
        <!-- okhttp -->
        <dependency>
            <groupId>com.squareup.okhttp3</groupId>
            <artifactId>okhttp</artifactId>
            <version>3.14.2</version>
        </dependency>
        <!-- hutool -->
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>4.5.16</version>
        </dependency>

关键代码

 /**
     * 
     * @param param1 参数1
     * @param param2 参数2
     * @param param3 参数3
     * 
     */
    private void getDataToLocalDataBase(String param1 , String param2 , String param3) {
        HttpParam httpParam = new HttpParam();
        httpParam.setApiUrl("爬取的网站");
        httpParam.setApiPath("接口地址");
        Map<String, String> parms = new HashMap<>();

        parms.put("param1 ", param1 );
        parms.put("param2 ", param2 );
        parms.put("param3 ", param3 );
        //....更多参数...
        /*     parms.put("strCustName","");*/
        //创建格式化参数
        Gson paramGson = new GsonBuilder().create();
        String requestParam = paramGson.toJson(parms);
        try {
        //post请求
            HttpResult postResult = HttpUtil.post(httpParam, requestParam);
            String result = postResult.getResult();

            int status = postResult.getStatus();
            Gson gson = new Gson();
            if (status == 200) {
                if (!StringUtils.isEmpty(result)) {
                    JsonObject jsonObject = (JsonObject) new JsonParser().parse(result);
                    JsonElement jsonElement = jsonObject.get("result");
                    String newResult = jsonElement.toString();
                    //xxData 与接口值返回相同的实体类  List<xxData>这里也可也是其他类型 按需去做
                    List<xxData> list = gson.fromJson(newResult, new TypeToken<List<xxData>>() {
                    }.getType());

                    log.info("数据有:{}",list.size());
                    if (list != null && list.size() > 0) {
                       //业务代码...把数据插入到本地数据库
                } else {
                    log.info("无数据");
                }
            } else {
               log.error("错误数据{}", result );
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

上面用的工具类

  • HttpParam
import okhttp3.MediaType;


/**
 * Mutable holder for the settings of a single HTTP call: target URL pieces, the three
 * timeouts, and the media type of the request body.
 */
public class HttpParam {

    /** Default body media type: JSON encoded as UTF-8. */
    public static final MediaType MEDIA_TYPE_JSON = MediaType.parse("application/json; charset=utf-8");

    /** Base URL of the remote API. */
    private String apiUrl;

    /** Path of the endpoint; HttpUtil appends it to {@link #apiUrl} when it is not null. */
    private String apiPath;

    /** Connect timeout; HttpUtil passes it to OkHttp with TimeUnit.MILLISECONDS. Defaults to 2000. */
    private int connectTimeout = 2_000;

    /** Read timeout; HttpUtil passes it to OkHttp with TimeUnit.MILLISECONDS. Defaults to 30000. */
    private int readTimeout = 30_000;

    /** Write timeout; HttpUtil passes it to OkHttp with TimeUnit.MILLISECONDS. Defaults to 30000. */
    private int writeTimeout = 30_000;

    /** Media type used for request bodies; defaults to {@link #MEDIA_TYPE_JSON}. */
    private MediaType mediaType = MEDIA_TYPE_JSON;

    // ---- URL ----

    public String getApiUrl() {
        return this.apiUrl;
    }

    public void setApiUrl(String apiUrl) {
        this.apiUrl = apiUrl;
    }

    public String getApiPath() {
        return this.apiPath;
    }

    public void setApiPath(String apiPath) {
        this.apiPath = apiPath;
    }

    // ---- timeouts ----

    public int getReadTimeout() {
        return this.readTimeout;
    }

    public void setReadTimeout(int readTimeout) {
        this.readTimeout = readTimeout;
    }

    public int getWriteTimeout() {
        return this.writeTimeout;
    }

    public void setWriteTimeout(int writeTimeout) {
        this.writeTimeout = writeTimeout;
    }

    public int getConnectTimeout() {
        return this.connectTimeout;
    }

    public void setConnectTimeout(int connectTimeout) {
        this.connectTimeout = connectTimeout;
    }

    // ---- body encoding ----

    public MediaType getMediaType() {
        return this.mediaType;
    }

    public void setMediaType(MediaType mediaType) {
        this.mediaType = mediaType;
    }
}
  • HttpResult 这个大家接收的可根据需要自定义
/**
 * Simple mutable container for the outcome of an HTTP call.
 *
 * @param <T> type of the optional decoded form of the response body
 */
public class HttpResult<T> {

    /** HTTP status code of the response. */
    private int status;

    /** Raw response body text. */
    private String result;

    /** Response body decoded into an object, or {@code null} when none was set. */
    private T resultObject;

    /** Creates an empty result: status {@code 0}, no body, no decoded object. */
    public HttpResult() {
    }

    /**
     * Creates a fully populated result.
     *
     * @param status       HTTP status code
     * @param result       raw response body text
     * @param resultObject decoded body, may be {@code null}
     */
    public HttpResult(int status, String result, T resultObject) {
        this.status = status;
        this.result = result;
        this.resultObject = resultObject;
    }

    /** @return the HTTP status code */
    public int getStatus() {
        return this.status;
    }

    /** @param status the HTTP status code to store */
    public void setStatus(int status) {
        this.status = status;
    }

    /** @return the raw response body text */
    public String getResult() {
        return this.result;
    }

    /** @param result the raw response body text to store */
    public void setResult(String result) {
        this.result = result;
    }

    /** @return the decoded body, or {@code null} when none was set */
    public T getResultObject() {
        return this.resultObject;
    }

    /** @param resultObject the decoded body to store */
    public void setResultObject(T resultObject) {
        this.resultObject = resultObject;
    }
}

重点来了

  • HttpUtil
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import lombok.extern.slf4j.Slf4j;
import okhttp3.*;

import java.io.IOException;
import java.util.Map;
import java.util.concurrent.TimeUnit;


/**
 * Static helpers for issuing blocking GET/POST requests with OkHttp and optionally decoding
 * the response body with Gson.
 *
 * <p>Every call builds its own {@link OkHttpClient} from the timeouts in the supplied
 * {@link HttpParam} and releases that client's thread pool and connection pool afterwards.
 * A response whose status is outside the 2xx range is reported as an {@link Exception}
 * carrying the HTTP status message.
 */
@Slf4j
public class HttpUtil {

    /** Gson used to decode response bodies; serializes nulls and does not HTML-escape. */
    private static final Gson GSON = new GsonBuilder().serializeNulls().disableHtmlEscaping().create();

    /**
     * Issues a GET request and returns the raw response body.
     *
     * @param restParam URL and timeout settings
     * @return the response body text, or {@code null} if the response carried no body
     * @throws Exception on transport failure or a non-2xx response
     */
    public static String get(HttpParam restParam) throws Exception {
        Request request = new Request.Builder()
                .url(buildUrl(restParam))
                .get()
                .build();
        return exec(restParam, request).getResult();
    }

    /**
     * Issues a GET request and decodes the JSON response body into {@code tClass}.
     *
     * @param restParam URL and timeout settings
     * @param tClass    target type for the decoded body
     * @return status, raw body and decoded object
     * @throws Exception on transport failure, a non-2xx response, or malformed JSON
     */
    public static <T> HttpResult<T> get(HttpParam restParam, Class<T> tClass) throws Exception {
        Request request = new Request.Builder()
                .url(buildUrl(restParam))
                .get()
                .build();
        return exec(restParam, request, tClass);
    }

    /**
     * Issues a POST request with an empty body and decodes the JSON response into {@code tClass}.
     *
     * @param restParam URL, media type and timeout settings
     * @param tClass    target type for the decoded body
     * @return status, raw body and decoded object
     * @throws Exception on transport failure, a non-2xx response, or malformed JSON
     */
    public static <T> HttpResult<T> post(HttpParam restParam, Class<T> tClass) throws Exception {
        // Request.Builder defaults to GET; an explicit (empty) body is required to send a POST.
        RequestBody body = RequestBody.create(restParam.getMediaType(), "");
        Request request = new Request.Builder()
                .url(buildUrl(restParam))
                .post(body)
                .build();
        return exec(restParam, request, tClass);
    }

    /**
     * Issues a POST request with a JSON string body and decodes the JSON response into {@code tClass}.
     *
     * @param restParam   URL, media type and timeout settings
     * @param reqJsonData request body text
     * @param tClass      target type for the decoded body
     * @return status, raw body and decoded object
     * @throws Exception on transport failure, a non-2xx response, or malformed JSON
     */
    public static <T> HttpResult<T> post(HttpParam restParam, String reqJsonData, Class<T> tClass) throws Exception {
        RequestBody body = RequestBody.create(restParam.getMediaType(), reqJsonData);
        Request request = new Request.Builder()
                .url(buildUrl(restParam))
                .post(body)
                .build();
        return exec(restParam, request, tClass);
    }

    /**
     * Issues a form-encoded POST request built from {@code parms} and decodes the JSON
     * response into {@code tClass}.
     *
     * @param restParam URL and timeout settings
     * @param parms     form fields; {@code null} sends an empty form
     * @param tClass    target type for the decoded body
     * @return status, raw body and decoded object
     * @throws Exception on transport failure, a non-2xx response, or malformed JSON
     */
    public static <T> HttpResult<T> post(HttpParam restParam, Map<String, String> parms, Class<T> tClass) throws Exception {
        FormBody.Builder form = new FormBody.Builder();
        if (parms != null) {
            for (Map.Entry<String, String> entry : parms.entrySet()) {
                form.add(entry.getKey(), entry.getValue());
            }
        }
        Request request = new Request.Builder()
                .url(buildUrl(restParam))
                .post(form.build())
                .build();
        return exec(restParam, request, tClass);
    }

    /**
     * Issues a POST request with a JSON string body and returns the undecoded response.
     *
     * @param restParam   URL, media type and timeout settings
     * @param reqJsonData request body text
     * @return status and raw body; the decoded object is always {@code null}
     * @throws Exception on transport failure or a non-2xx response
     */
    public static <T> HttpResult<T> post(HttpParam restParam, String reqJsonData) throws Exception {
        RequestBody body = RequestBody.create(restParam.getMediaType(), reqJsonData);
        Request request = new Request.Builder()
                .url(buildUrl(restParam))
                .post(body)
                .build();
        return exec(restParam, request);
    }

    /** Joins the base URL with the API path when a path is set. */
    private static String buildUrl(HttpParam restParam) {
        String url = restParam.getApiUrl();
        if (restParam.getApiPath() != null) {
            url = url + restParam.getApiPath();
        }
        return url;
    }

    /**
     * Executes the request and additionally decodes the body into {@code tClass}.
     *
     * <p>Decoding into {@code tClass} happens only for status 200 with a non-empty body.
     * For any other successful status the body is leniently decoded as a JSON string,
     * keeping the raw text if that fails.
     */
    private static <T> HttpResult<T> exec(
            HttpParam restParam,
            Request request,
            Class<T> tClass) throws Exception {

        HttpResult<T> clientResult = exec(restParam, request);
        String result = clientResult.getResult();
        int status = clientResult.getStatus();

        T decoded = null;
        if (status == 200) {
            // Only a non-empty body can be decoded (the previous check was inverted).
            if (result != null && !result.isEmpty()) {
                decoded = GSON.fromJson(result, tClass);
            }
        } else {
            try {
                result = GSON.fromJson(result, String.class);
            } catch (Exception ex) {
                // Body is not a plain JSON string; keep the raw text.
                log.warn("Response body for status {} is not a JSON string", status, ex);
            }
        }
        return new HttpResult<>(status, result, decoded);
    }

    /**
     * Executes the request with a client configured from {@code restParam}'s timeouts.
     *
     * @return a non-null result holding the status code and the body text (body text is
     *         {@code null} only if the response has no body)
     * @throws Exception wrapping any failure, with the original exception as the cause
     */
    private static <T> HttpResult<T> exec(
            HttpParam restParam,
            Request request) throws Exception {

        OkHttpClient client = null;
        try {
            // OkHttpClient is immutable: timeouts take effect only on the client returned by build().
            client = new OkHttpClient.Builder()
                    .connectTimeout(restParam.getConnectTimeout(), TimeUnit.MILLISECONDS)
                    .readTimeout(restParam.getReadTimeout(), TimeUnit.MILLISECONDS)
                    .writeTimeout(restParam.getWriteTimeout(), TimeUnit.MILLISECONDS)
                    .build();

            // try-with-resources closes the response (and its body) on every path,
            // including the non-successful one.
            try (Response response = client.newCall(request).execute()) {
                if (!response.isSuccessful()) {
                    throw new Exception(response.message());
                }
                ResponseBody responseBody = response.body();
                String responseString = responseBody == null ? null : responseBody.string();
                return new HttpResult<>(response.code(), responseString, null);
            }
        } catch (Exception ex) {
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new Exception(ex.getMessage(), ex);
        } finally {
            if (client != null) {
                // Release the per-call thread pool and connection pool.
                // No cache is configured on this client, so there is none to close.
                client.dispatcher().executorService().shutdown();
                client.connectionPool().evictAll();
            }
        }
    }
}

Java爬取文件

爬取数据的部分就告一段落了,接下来就是拿这些数据去爬取文件了。
其实在网上找了很多爬取文件的文章,都不太靠谱,最后借助 commons-io 的 FileUtils 工具类,三行代码就搞定了。
在这之前还是要先导入关键的工具包,也就是最上面 pom 文件里的依赖。
 // Download the remote file to a local file. The second argument is the destination
 // FILE (not a directory); copyURLToFile creates missing parent directories.
 URL url = new URL("文件地址");
 File destination = new File("本地存储文件地址");
 // Use the overload with explicit timeouts (milliseconds): without them a stalled
 // server can block the crawl indefinitely.
 int connectTimeoutMillis = 10_000;
 int readTimeoutMillis = 60_000;
 FileUtils.copyURLToFile(url, destination, connectTimeoutMillis, readTimeoutMillis);

github地址
看完有用就点个赞吧

  • 6
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 7
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值