- 前言,接着上一篇(下载(导出)pdf模板文件(比如:审批单));
- 报错原因:html格式有误,结束标签丢失(有时即使已确认html本身是正确的,仍然会报这个错);
- 解决思路:在拿到html内容时,通过jsoup格式化html内容;
- pom引入jsoup包(下面的代码还用到了JTidy的Tidy类,请确认项目中已引入jtidy依赖)
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
</dependency>
- 在java代码中实现html格式化(先用jsoup规范化html,再用Tidy输出为xhtml文件)
/**
 * Converts an HTML file into an XHTML file.
 *
 * <p>The input is first normalized with jsoup (XML syntax, XHTML escape mode)
 * so that missing end tags are repaired, and is then passed through JTidy
 * with XHTML output enabled before being written to disk.
 */
public class Html2Xhtml {

    /** Chunk size used when reading the source file. */
    private static final int READ_BUFFER_SIZE = 8192;

    /**
     * Reads the HTML file at {@code htmlPath}, repairs and tidies it, and
     * writes the result to {@code xhtmlPath}.
     *
     * @param htmlPath  path of the source HTML file, read as UTF-8
     * @param xhtmlPath path of the XHTML file to create or overwrite
     * @return {@code xhtmlPath} on success; {@code null} when {@code htmlPath}
     *         is empty or when reading/writing fails with an {@link IOException}
     */
    public static String html2Xhtml(String htmlPath, String xhtmlPath) {
        if (StringUtils.isEmpty(htmlPath)) {
            return null;
        }
        try {
            String html = readFileAsUtf8(htmlPath);
            byte[] xhtml = tidyToXhtml(formatHtml(html));
            // try-with-resources guarantees the output file handle is released
            // even if the write fails part-way through.
            try (FileOutputStream fileOut = new FileOutputStream(xhtmlPath)) {
                fileOut.write(xhtml);
            }
            return xhtmlPath;
        } catch (IOException e) {
            // Covers FileNotFoundException as well (it is an IOException).
            // Callers rely on a null return to detect failure.
            e.printStackTrace();
            return null;
        }
    }

    /** Reads the whole file into memory and decodes it as UTF-8. */
    private static String readFileAsUtf8(String filePath) throws IOException {
        try (FileInputStream in = new FileInputStream(filePath);
             ByteArrayOutputStream content = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[READ_BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                content.write(buffer, 0, read);
            }
            return new String(content.toByteArray(), StandardCharsets.UTF_8);
        }
    }

    /** Runs JTidy over the given markup and returns the UTF-8 encoded XHTML output. */
    private static byte[] tidyToXhtml(String html) throws IOException {
        Tidy tidy = new Tidy();
        tidy.setInputEncoding("UTF-8");
        tidy.setOutputEncoding("UTF-8");
        tidy.setShowWarnings(false);
        tidy.setIndentContent(true);
        tidy.setSmartIndent(true);
        tidy.setIndentAttributes(false);
        tidy.setMakeClean(true);
        tidy.setQuiet(true);
        tidy.setWord2000(true);
        tidy.setXHTML(true);
        // Auto-flush so that anything Tidy reports actually reaches stdout;
        // System.out itself must not be closed here.
        tidy.setErrout(new PrintWriter(System.out, true));

        // In-memory streams hold no OS resources, so no explicit close is needed.
        ByteArrayInputStream tidyInput =
                new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        ByteArrayOutputStream tidyOut = new ByteArrayOutputStream();
        tidy.parse(tidyInput, tidyOut);
        return tidyOut.toByteArray();
    }

    /**
     * Parses the HTML with jsoup and re-serializes it using XML syntax and the
     * XHTML escape mode.
     *
     * @param html raw HTML markup
     * @return the re-serialized document markup
     */
    private static String formatHtml(String html) {
        Document document = Jsoup.parse(html);
        document.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
        document.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        return document.html();
    }
}