Java通过HttpURLConnection访问页面并解析HTML文件元素

最新推荐文章于 2024-04-23 17:29:36 发布

奋斗者ing

最新推荐文章于 2024-04-23 17:29:36 发布

阅读量923

点赞数

分类专栏： url html 文章标签： java html

本文链接：https://blog.csdn.net/liaonanfeng88/article/details/127994049

版权

url 同时被 2 个专栏收录

2 篇文章 0 订阅

订阅专栏

html

2 篇文章 0 订阅

订阅专栏

本文介绍如何在 Java 中通过 HttpURLConnection 访问页面，并使用 jsoup 解析 HTML 文件中的元素。

依赖包

<!-- jsoup: the HTML parser behind the Jsoup.parse(...) calls in the parsing examples. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>

带 Cookie 的 GET 请求

/**
 * Sends an HTTP GET request and returns the response body as text.
 *
 * <p>The body is read only when the server answers {@code 200 OK}. For any other status, or
 * when an exception occurs, whatever was read so far (possibly the empty string) is returned.
 * Exceptions are logged and never propagated to the caller.
 *
 * <p>The body is decoded with the charset declared in the response {@code Content-Type}
 * header, falling back to UTF-8, so the result does not depend on the platform default.
 *
 * @param url    absolute URL to request
 * @param cookie cookie entries joined with {@code ;} into a single {@code Cookie} header;
 *               no header is sent when {@code CollectionUtils.isEmpty(cookie)} is true
 * @return the response body with every line terminated by {@code \n}; never {@code null}
 */
public String get(String url,List<String> cookie) {
    log.info("start url:{}", url);
    StringBuilder builder = new StringBuilder();
    try {
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        // Common request headers.
        conn.setRequestProperty("accept", "*/*");
        conn.setRequestProperty("connection", "Keep-Alive");
        conn.setRequestProperty("user-agent", USER_AGENT);
        if (!CollectionUtils.isEmpty(cookie)) {
            conn.setRequestProperty("Cookie", Joiner.on(";").join(cookie));
        }

        conn.connect();
        if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
            java.nio.charset.Charset charset = responseCharset(conn.getContentType());
            // try-with-resources closes the reader even when reading fails part-way.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), charset))) {
                String line;
                while ((line = in.readLine()) != null) {
                    builder.append(line).append("\n");
                }
            }
        }
        // Parameterized logging: "{}" is a logger placeholder, not a String.format one.
        log.info("end get result: {}", builder);
    } catch (Exception e) {
        log.error("HttpUtil.get error.", e);
    }
    return builder.toString();
}

/**
 * Extracts the charset from a {@code Content-Type} header value.
 *
 * @param contentType header value such as {@code text/html; charset=GBK}; may be {@code null}
 * @return the declared charset, or UTF-8 when none is declared or the label is not usable
 */
private static java.nio.charset.Charset responseCharset(String contentType) {
    final String key = "charset=";
    if (contentType != null) {
        for (String part : contentType.split(";")) {
            String token = part.trim();
            if (token.regionMatches(true, 0, key, 0, key.length())) {
                String label = token.substring(key.length()).replace("\"", "").trim();
                try {
                    return java.nio.charset.Charset.forName(label);
                } catch (IllegalArgumentException ignored) {
                    // Unknown or malformed charset label: use the UTF-8 default below.
                    break;
                }
            }
        }
    }
    return java.nio.charset.StandardCharsets.UTF_8;
}

使用 jsoup 解析 HTML 元素

/**
 * Parses an HTML string and returns the {@code <body>} element of the resulting document.
 *
 * @param htmlStr HTML markup to parse
 * @return the body element of the parsed document
 */
public static Element bodyByHtmlStr(String htmlStr) {
    // Parse the markup into a Document and hand back its body in one step.
    return Jsoup.parse(htmlStr).body();
}
/**
 * Reads a local HTML file and returns its content as a string.
 *
 * <p>The file is decoded with the charset named {@code "unicode"}.
 * NOTE(review): the JDK appears to treat this as an alias of UTF-16 — confirm the input
 * files really use that encoding.
 *
 * <p>I/O failures (including a missing file) are printed to standard error and whatever was
 * read up to that point is returned, so the method never throws for an unreadable file.
 *
 * @param file the HTML file to read
 * @return the file content with every line terminated by {@code \n}; empty when nothing
 *         could be read
 */
public static String toHtmlString(File file) {
    StringBuilder html = new StringBuilder();
    // try-with-resources closes the reader even when reading fails part-way.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(
            new FileInputStream(file), "unicode"))) {
        String line;
        // readLine() returning null is the end-of-stream signal; ready() is not an EOF test.
        while ((line = reader.readLine()) != null) {
            // Keep the line break so markup split across lines does not fuse together.
            html.append(line).append("\n");
        }
    } catch (IOException e) {
        // FileNotFoundException is an IOException, so a missing file lands here as well.
        e.printStackTrace();
    }
    return html.toString();
}

/**
 * Demo entry point: prints the cell text of the first element with class {@code fc} in a
 * local HTML file, one table row per output line, each cell preceded by a space.
 *
 * @param args optional; {@code args[0]} overrides the default file path
 */
public static void main(String[] args){
    String filePath = args.length > 0 ? args[0] : "D:\\test\\test.html";
    // Read the .html file into a string and parse it into a Document.
    String htmlStr = toHtmlString(new File(filePath));
    Document doc = Jsoup.parse(htmlStr);

    // Look up the element carrying class="fc" (expected to be a table).
    Element table = doc.body().getElementsByClass("fc").first();
    if (table == null) {
        // first() yields null when nothing matched; report it instead of failing with an NPE.
        System.err.println("No element with class \"fc\" found in " + filePath);
        return;
    }
    Elements children = table.children();
    if (children.isEmpty()) {
        System.err.println("Element with class \"fc\" has no child elements in " + filePath);
        return;
    }

    // Rows are collected from the first child (the tbody in the expected layout).
    Elements rows = children.get(0).getElementsByTag("tr");
    for (Element row : rows) {
        for (Element cell : row.getElementsByTag("td")) {
            System.out.print(" " + cell.text());
        }
        System.out.println();
    }
}