package com.thinkyun.community.util.reptile;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
/**
 * Small HTTP scraping helper: downloads a page, selects one element with a
 * CSS selector and returns that element's inner HTML together with the
 * {@code src} of the first image found inside it.
 */
public final class Reptile {

    // The logger name stays the literal "Reptile" so existing logging configuration keeps matching.
    static final Logger logger = LoggerFactory.getLogger("Reptile");

    public static final String GBK = "GBK";
    public static final String UTF8 = "UTF-8";
    public static final String POST = "POST";
    public static final String GET = "GET";

    /** Upper bound for establishing the TCP connection, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Upper bound for a single blocking read from the response, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 30000;

    /**
     * Requests {@code urlPath} and extracts the element matched by {@code element}.
     *
     * <p>This method does not throw: I/O failures and runtime failures (for example an
     * invalid selector or a {@code null} charset name) are logged, and whatever was
     * collected up to that point is returned.
     *
     * @param urlPath       URL of the page to fetch
     * @param cookies       value for the {@code Cookie} request header, or {@code null} to send none
     * @param element       CSS selector identifying the element whose content is extracted
     * @param encode        name of the charset used to decode the response body
     * @param requestMethod HTTP method, e.g. {@link #GET} or {@link #POST}
     * @return a map that may contain:
     *         {@code "resultCode"} (Integer) - the HTTP status code, present once a response was received;
     *         {@code "content"} (String) - inner HTML of the first matched element, only for status 200;
     *         {@code "firstImage"} (String) - {@code src} attribute of the first {@code img} inside that
     *         content, only when such an image exists
     */
    public static Map<String, Object> request(String urlPath, String cookies, String element, String encode, String requestMethod) {
        Map<String, Object> map = new HashMap<>();
        HttpURLConnection connection = null;
        logger.info("进入爬虫接口");
        try {
            URL url = new URL(urlPath);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod(requestMethod);
            // Without timeouts a stalled server would block the calling thread indefinitely.
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);
            // No request body is ever written, so output is only flagged for POST;
            // some HttpURLConnection implementations treat doOutput on a GET as a request to POST.
            connection.setDoOutput(POST.equals(requestMethod));
            connection.setDoInput(true);
            if (cookies != null) {
                connection.setRequestProperty("Cookie", cookies);
            }

            int code = connection.getResponseCode();
            map.put("resultCode", code);
            // Only a 200 response is parsed; any other status yields just the code.
            if (code == HttpURLConnection.HTTP_OK) {
                extractContent(readBody(connection, encode), element, map);
            }
        } catch (IOException | RuntimeException e) {
            // Boundary catch: callers rely on always getting the map back, so the failure
            // is logged with its stack trace instead of being propagated.
            logger.error("接口异常", e);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
            logger.info("接口结束");
        }
        return map;
    }

    /** Reads the whole response body and decodes it with the charset named by {@code encode}. */
    private static String readBody(HttpURLConnection connection, String encode) throws IOException {
        try (BufferedInputStream in = new BufferedInputStream(connection.getInputStream())) {
            return new String(IOUtils.toByteArray(in), encode);
        }
    }

    /** Stores the matched element's inner HTML and, if present, its first image URL into {@code map}. */
    private static void extractContent(String page, String element, Map<String, Object> map) {
        Document document = Jsoup.parseBodyFragment(page);
        Element masthead = document.selectFirst(element);
        if (masthead == null) {
            logger.warn("No element matched selector '{}'", element);
            return;
        }

        // html() keeps the markup; text() would strip it down to plain text.
        String html = masthead.html();
        // Stored before the image lookup so a page without images still yields its content.
        map.put("content", html);

        // Re-parse the extracted fragment so the image search is limited to the selected content.
        Elements coverImages = Jsoup.parseBodyFragment(html).select("img");
        if (!coverImages.isEmpty()) {
            map.put("firstImage", coverImages.get(0).attr("src"));
        }
    }

    /** Manual smoke test: fetches a sample article and logs the resulting status code. */
    public static void main(String[] args) {
        Map<String, Object> result = request("https://mp.weixin.qq.com/s?__biz=MjM5NTY1MjY0MQ==&mid=2650743162&idx=2&sn=3016639b1ef0eed00cc567b1c6117d9a&chksm=befeb43489893d22eae2e6724e2b91304caa60c8d152c9e6bc511366908c2e1d5b1d165a2793&scene=0#rd",
                null,
                "div#js_content", UTF8, GET);
        logger.info("resultCode={}", result.get("resultCode"));
    }
}
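/*
 * Maven dependency required by this class (belongs in pom.xml, not in this source file):
 *
 * <!-- HTML parsing -->
 * <dependency>
 *     <groupId>org.jsoup</groupId>
 *     <artifactId>jsoup</artifactId>
 *     <version>1.11.3</version>
 * </dependency>
 */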