使用 Jsoup 发送 GET / POST 请求爬取数据

1 依赖

<dependency>
  <!-- jsoup HTML parser library @ https://jsoup.org/ (the examples in this article use version 1.13.1) -->
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.13.1</version>
</dependency>

2 Get 请求

2.1 HTML
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

...

/**
 * Sends an HTTP GET request, parses the response as an HTML document and converts
 * the text of the selected elements into a JSON object.
 *
 * <p>The class name {@code "标签名"} looks like a placeholder to be replaced with the
 * real CSS class of the target elements (TODO confirm). {@code changeJson} is not
 * shown in this snippet; it is assumed to turn the extracted text into a JSONObject.
 *
 * @param paramUrl the URL to fetch
 * @return the converted JSON object, or {@code null} if any exception occurred
 */
public JSONObject doGet(String paramUrl) {
    JSONObject result = null;
    try {
        // Configure the request first, then execute it as a separate step.
        Connection connection = Jsoup.connect(paramUrl)
                .ignoreContentType(true)
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1295.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.5 WindowsWechat")
                .timeout(10000);
        Document page = connection.get();

        String extractedText = page.getElementsByClass("标签名").text();
        result = changeJson(extractedText);  // convert the text to JSON data
    } catch (Exception e) {
        // Failures are only reported on stderr; the caller receives null.
        e.printStackTrace();
    }
    return result;
}
2.2 JSON
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;

...

/**
 * Sends an HTTP GET request and parses the response body as a JSON object.
 *
 * @param url the URL to fetch
 * @return the parsed body when the status code is 200; {@code null} for any other
 *         status code or if an exception occurred
 */
public JSONObject doGet(String url) {
    try {
        Response reply = Jsoup.connect(url)
                .ignoreContentType(true)
                .header("Content-type", "application/x-www-form-urlencoded; charset=UTF-8")
                .header("x-forwarded-for", IpUtil.getRandomIp())  // IpUtil is a self-written utility class
                .method(Method.GET)
                .timeout(10000)
                .execute();

        // Anything other than 200 is treated as "no result".
        if (reply.statusCode() != 200) {
            return null;
        }
        return JSONObject.parseObject(reply.body());
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}

3 Post 请求

3.1 application/x-www-form-urlencoded
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;


/**
 * Sends an HTTP POST request with form parameters
 * ({@code application/x-www-form-urlencoded}) and parses the response body as JSON.
 *
 * @param param the form parameters passed to the request via {@code data(param)}
 * @param url   the URL to post to
 * @return the parsed body when the status code is 200; {@code null} for any other
 *         status code or if an exception occurred
 */
public JSONObject doPost(Map<String, String> param, String url) {
    try {
        Response reply = Jsoup.connect(url)
                .ignoreContentType(true)
                .header("Content-type", "application/x-www-form-urlencoded")
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
                .header("x-forwarded-for", IpUtil.getRandomIp())  // IpUtil is a self-written utility class
                .method(Method.POST)
                .data(param)
                .timeout(25000)
                .execute();

        // The body is only read and parsed for a 200 response.
        return reply.statusCode() == 200
                ? JSONObject.parseObject(reply.body())
                : null;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
3.2 text/plain
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;

/**
 * Sends an HTTP POST request with a raw {@code text/plain} body and parses the
 * response body as JSON.
 *
 * @param param the request body, formatted as {@code key1=value1&key2=value2}
 * @param url   the URL to post to
 * @return the parsed body when the status code is 200; {@code null} for any other
 *         status code or if an exception occurred
 */
public JSONObject doPost(String param, String url) {
    JSONObject parsed = null;
    try {
        Response reply = Jsoup.connect(url)
                .ignoreContentType(true)
                .header("Content-type", "text/plain")
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
                .header("x-forwarded-for", IpUtil.getRandomIp())  // IpUtil is a self-written utility class
                .method(Method.POST)
                .requestBody(param)
                .timeout(15000)
                .execute();

        boolean ok = reply.statusCode() == 200;
        if (ok) {
            parsed = JSONObject.parseObject(reply.body());
        }
    } catch (Exception e) {
        // Failures are only reported on stderr; the caller receives null.
        e.printStackTrace();
    }
    return parsed;
}
3.3 application/json
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;

...

public JSONObject doPost(String paramJsonStr, Proxy proxy, url) {
        try {
            Response response = Jsoup.connect(url)
                    				 .ignoreContentType(true)
                    				 .header("Content-Type", "application/json;charset=UTF-8")
                    				 .header("User-Agent", "Mozilla/5.0 (Linux; Android 5.1.1; sm-j200g Build/LMY48Z) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36")
                   					 .requestBody(paramJsonStr)
                  					 .proxy(proxy)  // 代理 IP
                    				 .method(Method.POST)
                    				 .timeout(10000)
                    				 .execute();
            if (response.statusCode() == 200) {
                String bodyStr = response.body();
                return JSONObject.parseObject(bodyStr);
            }
        }
        catch (Exception e) {
			e.printStackTrace();
        }
        return null;
}
3.4 HTML
    private JSONObject doPost(Map<String, String> param, String cookie) {
        try {
            Document doc = Jsoup.connect("http://*****").ignoreContentType(true)
                    .header("Content-Type", "application/x-www-form-urlencoded")
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.5 WindowsWechat")
                    .data(param)
                    .header("cookie", cookie)
                    .timeout(10000)
                    .post();
            JSONObject reqResJson = new JSONObject();
            Elements elements = doc.getElementsByTag("li");
        	// todo 数据格式转换;
            reqResJson = change(elements, reqResJson);
            return reqResJson;
        }
        catch (Exception e) {
            // todo 数据格式转换;
        }
        return reqResJson
    }

4 IpUtil

import java.util.Random;

/**
 * Utility class that generates random IPv4 address strings.
 *
 * <p>Addresses are drawn from the fixed table of ranges in {@code RANGE}.
 *
 * @author pkyShare
 */
public class IpUtil {
    /**
     * IP ranges as inclusive {first, last} pairs.
     *
     * <p>Each address is stored as the signed 32-bit int that has the same bit
     * pattern as the dotted-quad address, which is why addresses whose first octet
     * is 128 or higher appear as negative numbers.
     */
    private static final int[][] RANGE = {
            // 36.56.0.0-36.63.255.255
            { 607649792, 608174079 },
            // 61.232.0.0-61.237.255.255
            { 1038614528, 1039007743 },
            // 106.80.0.0-106.95.255.255
            { 1783627776, 1784676351 },
            // 121.76.0.0-121.77.255.255
            { 2035023872, 2035154943 },
            // 123.232.0.0-123.235.255.255
            { 2078801920, 2079064063 },
            // 139.196.0.0-139.215.255.255
            { -1950089216, -1948778497 },
            // 171.8.0.0-171.15.255.255
            { -1425539072, -1425014785 },
            // 182.80.0.0-182.92.255.255
            { -1236271104, -1235419137 },
            // 210.25.0.0-210.47.255.255
            { -770113536, -768606209 },
            // 222.16.0.0-222.95.255.255
            { -569376768, -564133889 },
    };

    /** Shared generator, so that no new Random is allocated on every call. */
    private static final Random RANDOM = new Random();

    /**
     * Returns a random IP address taken from one of the ranges in {@code RANGE}.
     *
     * <p>A range is picked uniformly first, then an address uniformly inside it,
     * so addresses in small ranges are more likely than addresses in large ones.
     *
     * @return the IP address in dotted-quad form
     */
    public static String getRandomIp() {
        // Derive the bound from the table so adding or removing a range stays safe.
        int[] range = RANGE[RANDOM.nextInt(RANGE.length)];
        int first = range[0];
        int last = range[1];
        // nextInt's bound is exclusive; +1 makes the last address of the range reachable.
        int offset = RANDOM.nextInt(last - first + 1);
        return num2ip(first + offset);
    }

    /**
     * Converts a 32-bit integer into a dotted-quad IP address string.
     *
     * @param ip the address as an int; the highest byte becomes the first octet
     * @return the IP address in dotted-quad form
     */
    public static String num2ip(int ip) {
        // Masking with 0xff after each shift discards the sign extension of negative values.
        int[] b = new int[4];
        b[0] = ((ip >> 24) & 0xff);
        b[1] = ((ip >> 16) & 0xff);
        b[2] = ((ip >> 8) & 0xff);
        b[3] = (ip & 0xff);
        return b[0] + "." + b[1] + "." + b[2] + "." + b[3];
    }
}

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值