Java爬虫--http请求

1 首先导入相关的依赖包

<!-- Apache HttpClient: sends the GET/POST requests -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.4</version>
</dependency>

<!-- SLF4J binding to log4j 1.2: backs the Logger used by the handler -->
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.25</version>
</dependency>

<!-- Spring context: provides the @Component annotation -->
<dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-context</artifactId>
    <version>3.2.17.RELEASE</version>
</dependency>

<!-- jsoup: HTML parsing used by the example subclass in step 3 -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>

<!-- JUnit 4: runs the @Test method in step 3
     (add <scope>test</scope> if that class lives under src/test/java) -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
</dependency>

2 封装 HTTP 请求(GET/POST)获取网页内容

package com.cheng.webb1.http;

import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Base class for simple HTTP fetch-and-parse handlers.
 *
 * <p>{@link #get(String)} and {@link #post(String, Map)} each build a fresh
 * {@link CloseableHttpClient}, send one request with a browser-like
 * {@code User-Agent} header, read the body of a {@code 200 OK} response as
 * text and pass it to {@link #parse(String)}. For any other status, or when
 * an exception is thrown while building or sending the request,
 * {@code parse} receives an empty string instead.
 *
 * @author shucheng
 * @creation 2019-01-28 09:53:01
 * @param <T> type produced by {@link #parse(String)}
 */
@Component
public abstract class AbstractHttpReqHandler<T> {

    protected Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Browser-like User-Agent header value sent with every request.
     */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64)"
            + " AppleWebKit/537.36 (KHTML, like Gecko)" + " Chrome/58.0.3013.3 Safari/537.36";

    /**
     * Charset name used to decode response bodies and to encode POST form
     * parameters. Defaults to utf-8.
     */
    private String charset = "utf-8";

    /**
     * @return the charset name currently used for requests and responses
     */
    public String getCharset() {
        return charset;
    }

    /**
     * @param charset charset name to use for subsequent requests and responses
     */
    public void setCharset(String charset) {
        this.charset = charset;
    }

    /**
     * No-op hook; the base implementation ignores {@code value}.
     *
     * @param value ignored here
     */
    public void setValue(Object value) {

    }

    /**
     * Fetches {@code url} with an HTTP GET and parses the response body.
     *
     * @param url address to request
     * @return result of {@link #parse(String)} applied to the body, or to an
     *         empty string when the request fails or the status is not 200
     */
    public T get(String url) {
        logger.debug("{}", url);

        CookieStore cookieStore = new BasicCookieStore();
        String html = "";
        // try-with-resources closes the client even when the request throws.
        try (CloseableHttpClient client =
                     HttpClients.custom().setDefaultCookieStore(cookieStore).build()) {
            // Built inside the try so that a malformed URL is logged, not thrown.
            HttpGet byGet = new HttpGet(url);
            setUserAgent(byGet);
            html = fetchBody(client, byGet, "Get", url);
        } catch (Exception e) {
            // Throwable goes last so SLF4J logs the stack trace.
            logger.error("Fail(Get): {}", url, e);
        }
        return parse(html);
    }

    /**
     * Sends {@code parameterMap} as a URL-encoded form with an HTTP POST and
     * parses the response body.
     *
     * @param url          address to request
     * @param parameterMap form parameters; may be {@code null} for an empty body
     * @return result of {@link #parse(String)} applied to the body, or to an
     *         empty string when the request fails or the status is not 200
     */
    public T post(String url, Map<String, Object> parameterMap) {
        logger.debug("{}", url);

        String html = "";
        try (CloseableHttpClient client = HttpClients.custom().build()) {
            HttpPost byPost = new HttpPost(url);
            setUserAgent(byPost);
            setReqParams(byPost, parameterMap);
            html = fetchBody(client, byPost, "Post", url);
        } catch (Exception e) {
            logger.error("Fail(Post): {}", url, e);
        }
        return parse(html);
    }

    /**
     * Executes {@code request} and returns the response body as text.
     *
     * @param client  client used to send the request
     * @param request fully prepared request
     * @param method  label used in the failure log line ("Get" or "Post")
     * @param url     requested address, used only for logging
     * @return the body decoded with the configured charset when the status is
     *         200, otherwise an empty string
     * @throws IOException if sending the request or reading the body fails
     */
    private String fetchBody(CloseableHttpClient client, HttpUriRequest request, String method, String url)
            throws IOException {
        // The response is closed on every path, including when reading the body throws.
        try (CloseableHttpResponse response = client.execute(request)) {
            int status = response.getStatusLine().getStatusCode();
            if (status == HttpStatus.SC_OK) {
                return EntityUtils.toString(response.getEntity(), charset);
            }
            logger.info("Fail({}): {}[{}] -> {}", method, url, status,
                    response.getStatusLine().getReasonPhrase());
            return "";
        }
    }

    /**
     * Sets the User-Agent header on the request.
     *
     * @param req {@link HttpUriRequest}
     */
    private static void setUserAgent(HttpUriRequest req) {
        req.setHeader("User-Agent", USER_AGENT);
    }

    /**
     * Attaches {@code params} to the POST as a URL-encoded form entity,
     * encoded with the configured charset. Values are converted with
     * {@link String#valueOf(Object)}. Does nothing when {@code params} is
     * {@code null}.
     *
     * @param byPost request to attach the entity to
     * @param params form parameters, or {@code null}
     * @throws UnsupportedEncodingException if the configured charset is not supported
     */
    private void setReqParams(HttpPost byPost, Map<String, Object> params)
            throws UnsupportedEncodingException {
        if (null == params) {
            return;
        }

        List<NameValuePair> nvps = new ArrayList<>(params.size());
        for (Map.Entry<String, Object> entry : params.entrySet()) {
            nvps.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue())));
        }
        // Pass the charset explicitly: the one-argument constructor falls back
        // to ISO-8859-1, which garbles non-ASCII values.
        byPost.setEntity(new UrlEncodedFormEntity(nvps, charset));
    }

    /**
     * Converts the fetched page text into the handler's result object.
     *
     * @param html response body, or an empty string when the request failed
     * @return parsed result
     */
    protected abstract T parse(String html);

}

3 编写具体子类继承上面的抽象类,并实现 parse 方法

package com.cheng.webb1.http;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;

/**
 * Demo subclass of {@link AbstractHttpReqHandler}: parses the fetched page
 * with jsoup and prints the elements matching a selector.
 */
public class SpiderTest extends AbstractHttpReqHandler<Object>{

	/**
	 * Prints the elements selected by {@code #123} from the page.
	 *
	 * @param html page text to parse
	 * @return always {@code null}; real business logic is left to the reader
	 */
	@Override
	protected Object parse(String html) {
		Document page = Jsoup.parse(html);
		// Business logic is up to you; the demo only prints the matches.
		Elements matched = page.select("#123");
		System.out.println(matched);
		return null;
	}

	/** Fetches the placeholder URL through the handler's GET path. */
	@Test
	public void test() {
		SpiderTest spider = new SpiderTest();
		spider.get("http://www.xxxx.com/");
	}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值