jsoup 获取html中body内容_java读取html文件，并获取body中所有的标签以及内容

最新推荐文章于 2022-03-15 09:59:28 发布

weixin_39765625

最新推荐文章于 2022-03-15 09:59:28 发布

阅读量1k

点赞数

文章标签： jsoup 获取html中body内容

本文链接：https://blog.csdn.net/weixin_39765625/article/details/111814690

版权

该段代码展示了如何用Java标准IO类（并未实际调用jsoup库）逐行读取HTML文件，并获取body部分的所有标签及其内容。首先，通过FileInputStream和BufferedReader读取文件，然后搜索<body>标签以开始提取内容。在找到</body>标签之前，将每一行内容添加到body变量中。此外，代码还处理了图片的src属性，在相对路径前拼接指定目录，将其转换为绝对路径。

摘要由CSDN通过智能技术生成

package com.lmt.service.file;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import org.springframework.stereotype.Component;

import com.lmt.config.UrlConstants;

/**
 * Reads an HTML file line by line and returns the lines found between the
 * line that opens the body element and the line that closes it.
 */
@Component
public class ParseFile {

    // NOTE(review): the three markers below were reconstructed; confirm them
    // against the original source before relying on the exact spelling.

    /** Marker of the line that opens the body element (matched case-insensitively). */
    private static final String BODY_OPEN = "<body";

    /** Marker of the line that closes the body element (matched case-insensitively). */
    private static final String BODY_CLOSE = "</body";

    /** Delimiter used to cut a line into one piece per image element. */
    private static final String IMG_OPEN = "<img ";

    /** Length of {@code src="}, i.e. the offset of the attribute value inside a piece. */
    private static final int SRC_VALUE_OFFSET = 5;

    /** Directory that is put in front of every image path found in the body. */
    private static final String IMAGE_DIRECTORY = "D:/test";

    /**
     * Parses an HTML file and returns the content of its body.
     *
     * <p>The file is scanned line by line. Everything up to and including the
     * first line that contains the opening body marker is skipped (the marker
     * may be preceded by whitespace). Every following line is collected,
     * terminated with {@code '\n'}, until a line containing the closing body
     * marker is met; that line is not collected. Markup sharing a line with
     * either marker is therefore not part of the result.
     *
     * <p>Image elements written as {@code <img src="...">} get
     * {@link #IMAGE_DIRECTORY} prepended to their path.
     *
     * @param file the HTML file to read
     * @return the collected body lines; whatever was collected so far (possibly
     *         an empty string) when the file cannot be read
     */
    public String readHtml(File file) {
        StringBuilder body = new StringBuilder();
        // try-with-resources closes the stream even when reading fails half-way.
        try (Reader reader = new InputStreamReader(new FileInputStream(file));
                BufferedReader htmlReader = new BufferedReader(reader)) {
            String line;

            // Phase 1: skip ahead to the line that opens the body.
            boolean found = false;
            while (!found && (line = htmlReader.readLine()) != null) {
                found = indexOfIgnoreCase(line, BODY_OPEN) != -1;
            }

            // Phase 2: collect lines until the body is closed or the file ends.
            while ((line = htmlReader.readLine()) != null
                    && indexOfIgnoreCase(line, BODY_CLOSE) == -1) {
                // Only lines mentioning "src" can hold an image path to rewrite.
                if (indexOfIgnoreCase(line, "src") != -1) {
                    line = prefixImageSources(line, IMAGE_DIRECTORY);
                }
                body.append(line).append('\n');
            }
        } catch (Exception e) {
            // Best effort: report the problem and return what was read so far.
            e.printStackTrace();
        }
        return body.toString();
    }

    /**
     * Extracts the file name from the first {@code src=} attribute of an HTML
     * fragment.
     *
     * @param htmlLine one line of HTML that contains an image element
     * @return the last path segment of the attribute value, or an empty string
     *         when the fragment is {@code null}, has no {@code src=} attribute
     *         or the attribute has no value
     */
    public static String extractFilename(String htmlLine) {
        if (htmlLine == null) {
            return "";
        }
        int srcIndex = indexOfIgnoreCase(htmlLine, "src=");
        if (srcIndex == -1) {
            // No image in this fragment.
            return "";
        }
        String htmlSrc = htmlLine.substring(srcIndex + 4);

        // Tolerate whitespace between '=' and the value.
        int start = 0;
        while (start < htmlSrc.length() && Character.isWhitespace(htmlSrc.charAt(start))) {
            start++;
        }
        if (start == htmlSrc.length()) {
            return "";
        }

        String path;
        char first = htmlSrc.charAt(start);
        if (first == '"' || first == '\'') {
            // Quoted value: double quotes are the norm, single quotes are legal too.
            int close = htmlSrc.indexOf(first, start + 1);
            path = close == -1
                    ? htmlSrc.substring(start + 1)
                    : htmlSrc.substring(start + 1, close);
        } else {
            // Unquoted value: it ends at the next whitespace or at the tag end.
            int end = start;
            while (end < htmlSrc.length()
                    && !Character.isWhitespace(htmlSrc.charAt(end))
                    && htmlSrc.charAt(end) != '>') {
                end++;
            }
            path = htmlSrc.substring(start, end);
        }

        // Keep the last segment; both forward and backward slashes separate segments.
        int end = path.length();
        while (end > 0 && isSeparator(path.charAt(end - 1))) {
            end--;
        }
        int begin = end;
        while (begin > 0 && !isSeparator(path.charAt(begin - 1))) {
            begin--;
        }
        return path.substring(begin, end);
    }

    /**
     * Prepends {@code directory} to the path of every image element in a line.
     * The line is cut at each image delimiter so that several images on the
     * same line are all handled, then glued back together.
     */
    private static String prefixImageSources(String line, String directory) {
        // Limit -1 keeps trailing empty pieces, so a line that ends with the
        // delimiter is rebuilt unchanged.
        String[] pieces = line.split(IMG_OPEN, -1);
        StringBuilder rebuilt = new StringBuilder(line.length() + directory.length() + 1);
        for (int i = 0; i < pieces.length; i++) {
            if (i > 0) {
                rebuilt.append(IMG_OPEN);
            }
            rebuilt.append(prefixSource(pieces[i], directory));
        }
        return rebuilt.toString();
    }

    /**
     * Inserts {@code directory} right after the opening quote of a piece that
     * starts with a quoted {@code src} attribute; any other piece is returned
     * unchanged.
     */
    private static String prefixSource(String piece, String directory) {
        boolean startsWithQuotedSrc = piece.length() >= SRC_VALUE_OFFSET
                && piece.regionMatches(true, 0, "src=", 0, 4)
                && (piece.charAt(4) == '"' || piece.charAt(4) == '\'');
        if (!startsWithQuotedSrc) {
            return piece;
        }
        String value = piece.substring(SRC_VALUE_OFFSET);
        StringBuilder result =
                new StringBuilder(piece.length() + directory.length() + 1);
        result.append(piece, 0, SRC_VALUE_OFFSET).append(directory);
        // Make sure exactly the directory and the path do not run together.
        boolean directoryEndsWithSeparator = !directory.isEmpty()
                && isSeparator(directory.charAt(directory.length() - 1));
        boolean valueStartsWithSeparator = !value.isEmpty() && isSeparator(value.charAt(0));
        if (!directoryEndsWithSeparator && !valueStartsWithSeparator) {
            result.append('/');
        }
        return result.append(value).toString();
    }

    /** Returns the index of the first case-insensitive match of {@code needle}, or -1. */
    private static int indexOfIgnoreCase(String text, String needle) {
        int lastStart = text.length() - needle.length();
        for (int i = 0; i <= lastStart; i++) {
            if (text.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /** Tells whether {@code c} is a forward or a backward slash. */
    private static boolean isSeparator(char c) {
        return c == '/' || c == '\\';
    }
}

weixin_39765625

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫