工具类:
/**
 * Helpers for reducing an HTML document to text with most markup removed.
 */
public class HtmlUtils {

    // Patterns are compiled once; java.util.regex.Pattern is written fully qualified
    // so that this class does not depend on an additional import line.

    /** A complete {@code <script>...</script>} element, matched in any letter case. */
    private static final java.util.regex.Pattern SCRIPT_BLOCK =
            java.util.regex.Pattern.compile(
                    "<script[^>]*?>[\\s\\S]*?</script>",
                    java.util.regex.Pattern.CASE_INSENSITIVE);

    /** A complete {@code <style>...</style>} element, matched in any letter case. */
    private static final java.util.regex.Pattern STYLE_BLOCK =
            java.util.regex.Pattern.compile(
                    "<style[^>]*?>[\\s\\S]*?</style>",
                    java.util.regex.Pattern.CASE_INSENSITIVE);

    /**
     * Any {@code <...>} run on a single line whose text directly after {@code <} does not
     * start with {@code div}, {@code br}, {@code span}, {@code p} or {@code /p}.
     *
     * <p>Consequences of this (unchanged) expression worth knowing:
     * <ul>
     *   <li>closing tags such as {@code </div>} and {@code </span>} ARE removed, because
     *       only {@code /p} is exempted;</li>
     *   <li>the exemption is a prefix test, so e.g. {@code <pre>} is kept as well;</li>
     *   <li>{@code .} does not match line terminators, so a tag that spans several lines
     *       is left untouched.</li>
     * </ul>
     */
    private static final java.util.regex.Pattern OTHER_TAGS =
            java.util.regex.Pattern.compile("<(?!div|br|span|p|/p).*?>");

    /**
     * Extracts the content of an HTML string by stripping its tags.
     *
     * @param htmlStr HTML source; {@code null} is treated as empty
     * @return the result of {@link #delHtmlTags(String)} for {@code htmlStr}
     */
    public static String getTextFromHtml(String htmlStr) {
        return delHtmlTags(htmlStr);
    }

    /**
     * Reads an HTML file as UTF-8 and returns its content with tags stripped.
     *
     * <p>Processing steps:
     * <ol>
     *   <li>the whole file is decoded as UTF-8 in one go, so multi-byte characters can
     *       never be split by a buffer boundary;</li>
     *   <li>carriage returns are dropped;</li>
     *   <li>every newline-terminated line is trimmed and the lines are concatenated with
     *       no separator; a final line without a terminating newline is appended as is;</li>
     *   <li>the text is passed through {@code StringEscapeUtils.unescapeHtml}
     *       (presumably Apache Commons Lang - confirm against the file's imports);</li>
     *   <li>the result is passed to {@link #getTextFromHtml(String)}.</li>
     * </ol>
     *
     * @param path location of the HTML file
     * @return the extracted text, or an empty string if the file cannot be read or
     *         processed (the failure is reported on {@code System.err})
     * @throws NullPointerException if {@code path} is {@code null}
     */
    public static String readHtml(String path) {
        File file = new File(path);
        try (RandomAccessFile accessFile = new RandomAccessFile(file, "r")) {
            long length = accessFile.length();
            if (length > Integer.MAX_VALUE - 8) {
                throw new IllegalStateException(
                        "file too large to load into memory: " + length + " bytes");
            }
            byte[] content = new byte[(int) length];
            accessFile.readFully(content);

            String text = new String(content, Charset.forName("UTF-8")).replace("\r", "");

            StringBuilder joined = new StringBuilder(text.length());
            int lineStart = 0;
            int lineEnd;
            while ((lineEnd = text.indexOf('\n', lineStart)) != -1) {
                joined.append(text.substring(lineStart, lineEnd).trim());
                lineStart = lineEnd + 1;
            }
            // Trailing text without a newline is kept verbatim; delHtmlTags trims the
            // overall result afterwards.
            joined.append(text, lineStart, text.length());

            // Decode HTML entities first, then strip the markup.
            String html = StringEscapeUtils.unescapeHtml(joined.toString());
            return getTextFromHtml(html);
        } catch (Exception e) {
            // Callers rely on "" for any failure, so keep that contract, but do not
            // hide the reason.
            System.err.println("HtmlUtils.readHtml: cannot read '" + path + "': " + e);
            return "";
        }
    }

    /**
     * Removes tags from HTML source.
     *
     * <p>Whole {@code <script>} and {@code <style>} elements (including their bodies) are
     * removed first, regardless of letter case, so that neither JavaScript nor CSS text
     * ends up in the result. After that every remaining tag matched by
     * {@link #OTHER_TAGS} is removed, and the result is trimmed.
     *
     * @param htmlStr HTML source; {@code null} is treated as empty
     * @return the stripped, trimmed text; never {@code null}
     */
    public static String delHtmlTags(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        String result = SCRIPT_BLOCK.matcher(htmlStr).replaceAll("");
        result = STYLE_BLOCK.matcher(result).replaceAll("");
        result = OTHER_TAGS.matcher(result).replaceAll("");
        return result.trim();
    }
}
测试:
/**
 * Manual smoke test for {@code HtmlUtils.readHtml}.
 */
public class Test00 {

    /**
     * Extracts the text of a fixed local HTML file and writes it to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        System.out.println(HtmlUtils.readHtml("D:\\tess.html"));
    }
}
读取的html内容:
控制台打印结果: