Java通过HttpURLConnection访问页面并解析HTML文件元素
依赖包
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
GET带Cookie
/** * GET请求 * @param url * @param cookie * @return */ public String get(String url,List<String> cookie) { log.info("start url:{}",url); BufferedReader in = null; StringBuilder builder = new StringBuilder(); try { URL realUrl = new URL(url); // 打开和URL之间的连接 HttpURLConnection conn = (HttpURLConnection)realUrl.openConnection(); // 设置通用的请求属性 conn.setRequestProperty("accept", "*/*"); conn.setRequestProperty("connection", "Keep-Alive"); conn.setRequestProperty("user-agent", USER_AGENT); if (!CollectionUtils.isEmpty(cookie)) conn.setRequestProperty("Cookie", Joiner.on(";").join(cookie)); conn.connect(); if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) { // 定义 BufferedReader输入流来读取URL的响应 in = new BufferedReader(new InputStreamReader( conn.getInputStream())); String line; while ((line = in.readLine()) != null) { builder.append(line).append("\n"); } } log.info(String.format("end get result: {}", builder.toString())); } catch (Exception e) { e.printStackTrace(); log.error("HttpUtil.get error.", e); } // 使用finally块来关闭输入流 finally { try { if (in != null) { in.close(); } } catch (Exception e2) { log.error("HttpUtil.get finally error.", e2); } } return builder.toString(); }
HTML元素解析
/** * 获取body元素 * @param htmlStr * @return */ public static Element bodyByHtmlStr(String htmlStr) { //解析字符串为Document对象 Document doc = Jsoup.parse(htmlStr); //获取body元素,获取class="fc"的table元素 return doc.body(); }
/**
 * Reads a local HTML file into a string using the {@code "unicode"} charset name
 * (the JDK treats this name as an alias of UTF-16).
 *
 * <p>Equivalent to {@code toHtmlString(file, "unicode")}.
 *
 * @param file the HTML file to read
 * @return the file content with every line terminated by {@code '\n'}; an empty or
 *         partial string if the file cannot be read
 */
public static String toHtmlString(File file) {
    return toHtmlString(file, "unicode");
}

/**
 * Reads a local HTML file into a string using the given charset.
 *
 * <p>I/O failures (missing file, unsupported charset name, read error) are reported
 * on standard error and whatever was read so far is returned, so callers never
 * receive {@code null}.
 *
 * @param file        the HTML file to read
 * @param charsetName the name of the charset used to decode the file, e.g. {@code "UTF-8"}
 * @return the file content with every line terminated by {@code '\n'}; an empty or
 *         partial string if the file cannot be read
 */
public static String toHtmlString(File file, String charsetName) {
    StringBuilder html = new StringBuilder();
    // Both resources are declared separately so the file stream is closed even when
    // the reader cannot be created (e.g. unsupported charset name).
    try (FileInputStream fis = new FileInputStream(file);
         BufferedReader br = new BufferedReader(new InputStreamReader(fis, charsetName))) {
        String line;
        // readLine() returning null is the end-of-stream signal; ready() only says
        // whether a read would block and must not be used as an EOF test.
        while ((line = br.readLine()) != null) {
            // Keep the line break: gluing lines together can merge tokens that were
            // separated only by a newline (e.g. a tag name and its first attribute).
            html.append(line).append('\n');
        }
    } catch (IOException e) {
        // Best-effort read: report and fall through with what we have.
        // NOTE(review): no static logger is visible here, so this stays on stderr.
        e.printStackTrace();
    }
    return html.toString();
}

/**
 * Demo entry point: prints the text of every {@code <td>} of the first element with
 * class {@code "fc"} in an HTML file, one table row per output line.
 *
 * @param args optional; {@code args[0]} is the path of the HTML file to read. When
 *             absent, {@code D:\test\test.html} is used.
 */
public static void main(String[] args) {
    String filePath = (args != null && args.length > 0) ? args[0] : "D:\\test\\test.html";

    // Read the .html file into a string and parse it into a Document.
    String htmlStr = toHtmlString(new File(filePath));
    Document doc = Jsoup.parse(htmlStr);

    // First element carrying class="fc" (expected to be the table).
    Element table = doc.body().getElementsByClass("fc").first();
    if (table == null || table.children().isEmpty()) {
        // Nothing to print; avoid the NPE / IndexOutOfBoundsException the lookup below would raise.
        System.err.println("No element with class \"fc\" (or it has no children) in " + filePath);
        return;
    }

    // First child of the table (presumably the tbody), then all of its rows.
    Elements rows = table.children().get(0).getElementsByTag("tr");
    for (Element row : rows) {
        for (Element cell : row.getElementsByTag("td")) {
            System.out.print(" " + cell.text());
        }
        System.out.println();
    }
}