Description
When crawling with Java and fetching a site's data directly with HttpClient, some sites respond with a login page instead of the content, JD being one example.
(Figure: the login response returned by the site.)
Opening the returned link shows JD's login page, which is not the HTML page we actually want.
Analysis
When we access the site with a bare HttpClient, the request carries none of the headers a browser would send, so the site assumes it is being attacked and refuses to serve the real page.
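To reproduce the failure, here is a minimal sketch of such a bare request (the demo class and target URL are illustrative assumptions); according to the behavior described above, a site like JD answers it with its login page rather than the page content:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

public class BareRequestDemo {
    public static void main(String[] args) throws Exception {
        // No User-Agent is set here, so the site does not see a browser-like
        // request and serves its login page instead of the content we want.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(new HttpGet("https://www.jd.com"))) {
            System.out.println(response.getStatusLine());
        }
    }
}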
Solution
We need to add the appropriate request headers, i.e. make the request look as if it came from a browser instead of from a blank, bot-like client.
Set a User-Agent header on the request to mimic a browser:
httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36");
The complete utility class, with this header applied in doGet, is shown below.
package jd.util;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;

@Component
public class HttpUtils {

    private PoolingHttpClientConnectionManager cm;

    public HttpUtils() {
        // Shared connection pool: at most 100 connections in total, 10 per host.
        this.cm = new PoolingHttpClientConnectionManager();
        this.cm.setMaxTotal(100);
        this.cm.setDefaultMaxPerRoute(10);
    }

    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)          // time to establish the connection (ms)
                .setConnectionRequestTimeout(500) // time to lease a connection from the pool (ms)
                .setSocketTimeout(10000)          // time to wait for response data (ms)
                .build();
    }

    // Download the page at the given URL and return its body as a string.
    public String doGet(String url) throws IOException {
        // Build an HttpClient backed by the shared connection pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();

        // Create the GET request for the target URL.
        HttpGet httpGet = new HttpGet(url);
        // Mimic a browser; without this header the site treats the request
        // as an attack and serves its login page instead of the content.
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36");
        httpGet.setConfig(this.getConfig());

        // Execute the request; try-with-resources releases the response either way.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "UTF-8");
            }
            return "";
        }
    }

    // Download an image and save it under a random file name; returns that name.
    public String doGetImage(String url) throws IOException {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();

        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());

        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Keep the original extension, but use a random name to avoid collisions.
                String extName = url.substring(url.lastIndexOf("."));
                String picName = UUID.randomUUID().toString() + extName;
                // Note: the target directory is hard-coded here.
                try (OutputStream outputStream = new FileOutputStream(
                        new File("D:\\Intellij\\crawler\\src\\main\\resources\\imgs\\" + picName))) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
            return "";
        }
    }
}
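For reference, a minimal sketch of calling the utility directly (the demo class and URL are assumptions for illustration; in a Spring application the @Component would normally be injected rather than constructed by hand):

import jd.util.HttpUtils;

public class CrawlerDemo {
    public static void main(String[] args) throws Exception {
        HttpUtils httpUtils = new HttpUtils();
        // With the User-Agent header in place, doGet returns the page HTML
        // rather than a redirect to the login screen.
        String html = httpUtils.doGet("https://www.jd.com");
        System.out.println(html);
    }
}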
Output
The console now receives the crawled site's HTML text instead of the login link, and the text is ready for further processing.