InputStream inputStream = dto.getFile().getInputStream();
ZipInputStream zipInputStream = new ZipInputStream(inputStream);
ZipEntry entry;
String pageText ="";
//解压缩
while ((entry = zipInputStream.getNextEntry()) != null) {
if (!entry.isDirectory() && entry.getName().equals("report.html")) {
// 从ZipInputStream读取数据
byte[] buffer = new byte[1024];
StringBuilder stringBuilder = new StringBuilder();
int length;
while ((length = zipInputStream.read(buffer)) > 0) {
stringBuilder.append(new String(buffer, 0, length));
}
// 解析HTML文件
String htmlContent = stringBuilder.toString();
Document document = Jsoup.parse(htmlContent);
pageText = extractTextFromPage(document.body());
break; // 找到report.html后退出循环
}
}
if (pageText.length()==0){
throw new RuntimeException("请正确传递压缩文件!");
}
/**
 * Collects the text found in {@code node} and all of its descendants, in
 * document order.
 *
 * <p>Each text node contributes its text followed by a single space, and each
 * {@code tr} element contributes a line break before its children, so every
 * table row starts on a new line. Nodes that are neither text nodes nor
 * elements (and a {@code null} node) contribute nothing.
 *
 * @param node the root of the subtree to read
 * @return the accumulated text; empty if the subtree holds no text nodes or
 *     {@code tr} elements
 */
private static String extractTextFromPage(Node node) {
    // One shared builder for the whole traversal, so text is not re-copied
    // into a new string at every level of the tree.
    StringBuilder pageText = new StringBuilder();
    appendTextFromPage(node, pageText);
    return pageText.toString();
}

/** Appends the text of {@code node} and its descendants to {@code out}. */
private static void appendTextFromPage(Node node, StringBuilder out) {
    if (node instanceof TextNode) {
        // Text node: emit its text plus a separating space.
        out.append(((TextNode) node).text()).append(" ");
    } else if (node instanceof Element) {
        Element element = (Element) node;
        // Start every table row on a new line.
        if ("tr".equals(element.tagName())) {
            out.append("\n");
        }
        // Descend into the children of the current element.
        for (Node child : element.childNodes()) {
            appendTextFromPage(child, out);
        }
    }
}
解压缩并按照HTML原本的格式获取文本内容
最新推荐文章于 2024-09-29 09:14:30 发布