文档解析工具类
可能用到的maven依赖
<dependencies>
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.0.6</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
</dependencies>
将数据填充到 Word 模板(使用“{字段名}”形式的占位符)
/**
 * Fills a Word template with values taken from the declared fields of {@code data}.
 *
 * <p>All fields declared directly on the runtime class of {@code data} are made
 * accessible. Every paragraph returned by {@code templateDoc.getParagraphs()} and
 * every cell of every table returned by {@code templateDoc.getTables()} is then
 * handed to the placeholder-replacement helpers. The document is modified in place.
 *
 * @param templateDoc the template document; it is mutated and returned
 * @param data        the object whose declared fields supply the replacement values
 * @param <T>         the type of the data object
 * @return the same {@code templateDoc} instance that was passed in
 */
public static <T> XWPFDocument fillTemplate(XWPFDocument templateDoc, T data) {
    Field[] declaredFields = data.getClass().getDeclaredFields();
    // Open up private fields so their values can be read reflectively.
    for (int i = 0; i < declaredFields.length; i++) {
        declaredFields[i].setAccessible(true);
    }

    // Placeholders in paragraphs.
    templateDoc.getParagraphs()
            .forEach(para -> replacePlaceholder(para, declaredFields, data));

    // Placeholders in table cells.
    templateDoc.getTables().forEach(tbl ->
            tbl.getRows().forEach(tblRow ->
                    tblRow.getTableCells().forEach(tblCell ->
                            replacePlaceholderInCell(tblCell, declaredFields, data))));

    return templateDoc;
}
/**
 * Replaces {@code {fieldName}} placeholders in the runs of a paragraph.
 *
 * <p>Only the first text node of each run ({@code run.getText(0)}) is inspected, so a
 * placeholder is matched only when it sits entirely inside that single text node.
 * A {@code null} field value is rendered as a single space.
 *
 * @param paragraph the paragraph whose runs are scanned
 * @param fields    the candidate fields; each must already be accessible
 * @param data      the object the field values are read from
 * @param <T>       the type of the data object
 */
private static <T> void replacePlaceholder(XWPFParagraph paragraph, Field[] fields, T data) {
    for (XWPFRun run : paragraph.getRuns()) {
        String text = run.getText(0);
        if (text == null || !text.contains("{")) {
            continue;
        }
        boolean changed = false;
        for (Field field : fields) {
            String placeholder = "{" + field.getName() + "}";
            if (!text.contains(placeholder)) {
                continue;
            }
            try {
                Object value = field.get(data);
                text = text.replace(placeholder, value != null ? value.toString() : " ");
                changed = true;
            } catch (IllegalAccessException e) {
                // Pass the exception itself so the stack trace is not lost.
                log.error("生成底稿失败: " + e.getMessage(), e);
            }
        }
        // Write the run once, after every placeholder has been substituted.
        if (changed) {
            run.setText(text, 0);
        }
    }
}
/**
 * Replaces {@code {fieldName}} placeholders in every run of every paragraph of a table cell.
 *
 * <p>Only the first text node of each run ({@code run.getText(0)}) is inspected.
 * A {@code null} field value is rendered as an empty string. If the substituted text
 * contains a line feed, it is written through {@code handleTextWithLineBreaks};
 * otherwise it replaces the run's first text node directly.
 *
 * @param cell   the table cell to process
 * @param fields the candidate fields; each must already be accessible
 * @param data   the object the field values are read from
 * @param <T>    the type of the data object
 */
private static <T> void replacePlaceholderInCell(XWPFTableCell cell, Field[] fields, T data) {
    for (XWPFParagraph paragraph : cell.getParagraphs()) {
        for (XWPFRun run : paragraph.getRuns()) {
            String text = run.getText(0);
            if (text == null || !text.contains("{") || !text.contains("}")) {
                continue;
            }
            boolean changed = false;
            for (Field field : fields) {
                String placeholder = "{" + field.getName() + "}";
                if (!text.contains(placeholder)) {
                    continue;
                }
                try {
                    Object value = field.get(data);
                    // Accumulate into `text`: starting each replacement from the original
                    // string would let a later field's write restore earlier placeholders.
                    text = text.replace(placeholder, value != null ? value.toString() : "");
                    changed = true;
                } catch (IllegalAccessException e) {
                    // Pass the exception itself so the stack trace is not lost.
                    log.error("Error accessing field: " + e.getMessage(), e);
                }
            }
            if (!changed) {
                continue;
            }
            // Write the run once, after all placeholders have been substituted.
            if (text.contains("\n")) {
                handleTextWithLineBreaks(run, text);
            } else {
                run.setText(text, 0);
            }
        }
    }
}
/**
 * Writes multi-line text into a run, turning each {@code \n} or {@code \r\n} into a
 * line break inside the run.
 *
 * <p>The run's first text node is cleared and set to the first line; every following
 * line is preceded by a break and appended as a new text node.
 *
 * @param run  the run to write into
 * @param text the text to write; split on {@code \r?\n}
 */
private static void handleTextWithLineBreaks(XWPFRun run, String text) {
    String[] lines = text.split("\\r?\\n");
    // Clear the existing first text node before writing.
    run.setText("", 0);
    for (int i = 0; i < lines.length; i++) {
        if (i == 0) {
            run.setText(lines[0], 0);
        } else {
            run.addBreak();
            // Append as a new text node. XWPFRun.getTextPosition() is the run's vertical
            // raise/lower offset, not a text-node index, so it must not be used as the
            // position argument here.
            run.setText(lines[i]);
        }
    }
}
读取不同格式文件的页数
/**
 * Downloads a document over HTTP and returns its page (or slide) count.
 *
 * <p>Despite the method name, no PDF conversion is performed: the file extension
 * selects one of the {@code count*Page} helpers, which read the count from the
 * downloaded stream. Unrecognised extensions count as one page, and any failure
 * (bad URL, network error, parse error) is logged and also reported as one page.
 *
 * @param docUrl the HTTP(S) URL of the document
 * @return the page count reported by the matching helper, or {@code 1} for unknown
 *         formats and on any error
 */
public static int convertDocToPdfAndGetPageCount(String docUrl) {
    try {
        URL url = new URL(docUrl);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("GET");
        // try-with-resources: the stream is closed on every path, including "default".
        try (InputStream inputStream = connection.getInputStream()) {
            // Take the extension from the URL path only, so a query string or fragment
            // (e.g. "file.pdf?token=abc") does not hide the real extension.
            String path = url.getPath();
            String fileName = path.substring(path.lastIndexOf('/') + 1);
            String fileExtension = fileName.substring(fileName.lastIndexOf('.') + 1).toLowerCase();
            switch (fileExtension) {
                case "pdf":
                    return countPdfPage(inputStream);
                case "docx":
                    return countWord2007Page(inputStream);
                case "doc":
                    return countWord2003Page(inputStream);
                case "ppt":
                    return countPPTPage(inputStream);
                case "pptx":
                    return countPPTXPage(inputStream);
                default:
                    // Any other format is treated as a single page.
                    return 1;
            }
        }
    } catch (Exception e) {
        // Trailing throwable argument keeps the stack trace in the log.
        log.error("解析文档错误:{}", e.getMessage(), e);
        return 1;
    }
}
/**
 * Counts the pages of a PDF document.
 *
 * @param fileInputStream stream positioned at the start of the PDF content
 * @return the number of pages, or {@code 0} if the document could not be read
 */
public static int countPdfPage(InputStream fileInputStream) {
    int pageCount = 0;
    PdfReader reader = null;
    try {
        reader = new PdfReader(fileInputStream);
        pageCount = reader.getNumberOfPages();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // The constructor may have thrown, leaving reader null; closing unconditionally
        // would replace the handled IOException with a NullPointerException.
        if (reader != null) {
            reader.close();
        }
    }
    return pageCount;
}
/**
 * Counts the slides of a binary PowerPoint ({@code *.ppt}) document.
 *
 * @param fileInputStream stream positioned at the start of the PPT content; it is
 *                        closed by this method
 * @return the number of slides, or {@code 0} if reading the slide list fails
 * @throws IOException if the slideshow cannot be opened or closed
 */
public static int countPPTPage(InputStream fileInputStream) throws IOException {
    int pageCount = 0;
    // NOTE(review): this is a static, process-wide POI setting that appears to relax the
    // zip-bomb inflate-ratio guard; confirm it is intended for untrusted input.
    ZipSecureFile.setMinInflateRatio(-1.0d);
    // Both resources are released even when the HSLFSlideShow constructor throws.
    try (InputStream in = fileInputStream;
         HSLFSlideShow hslfSlideShow = new HSLFSlideShow(in)) {
        try {
            pageCount = hslfSlideShow.getSlides().size();
        } catch (Exception e) {
            // Best effort, as before: a failure while reading slides yields 0.
            e.printStackTrace();
        }
    }
    return pageCount;
}
/**
 * Counts the slides of an OOXML PowerPoint ({@code *.pptx}) document.
 *
 * @param fileInputStream stream positioned at the start of the PPTX content; it is
 *                        closed by this method
 * @return the number of slides, or {@code 0} if the document could not be read
 * @throws IOException declared for backward compatibility with existing callers
 */
public static int countPPTXPage(InputStream fileInputStream) throws IOException {
    int pageCount = 0;
    // NOTE(review): this is a static, process-wide POI setting that appears to relax the
    // zip-bomb inflate-ratio guard; confirm it is intended for untrusted input.
    ZipSecureFile.setMinInflateRatio(-1.0d);
    // try-with-resources closes the slideshow (previously leaked) and the stream.
    try (InputStream in = fileInputStream;
         XMLSlideShow pptxFile = new XMLSlideShow(in)) {
        pageCount = pptxFile.getSlides().size();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return pageCount;
}
/**
 * Reads the page count of a Word 2007+ ({@code *.docx}) document.
 *
 * <p>The value comes from the {@code Pages} entry of the document's extended
 * properties; the document is not laid out or rendered here.
 *
 * @param fileInputStream stream positioned at the start of the DOCX content; it is
 *                        closed by this method
 * @return the stored page count, or {@code 0} if the document could not be read
 * @throws IOException declared for backward compatibility with existing callers
 */
public static int countWord2007Page(InputStream fileInputStream) throws IOException {
    int pageCount = 0;
    // NOTE(review): this is a static, process-wide POI setting that appears to relax the
    // zip-bomb inflate-ratio guard; confirm it is intended for untrusted input.
    ZipSecureFile.setMinInflateRatio(-1.0d);
    // try-with-resources avoids the NullPointerException the old finally block raised
    // when the XWPFDocument constructor failed and the variable was still null.
    try (InputStream in = fileInputStream;
         XWPFDocument docx = new XWPFDocument(in)) {
        pageCount = docx.getProperties().getExtendedProperties().getUnderlyingProperties().getPages();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return pageCount;
}
/**
 * Reads the page count of a Word 97-2003 ({@code *.doc}) document.
 *
 * <p>The value comes from the document's summary information; the document is not
 * laid out or rendered here.
 *
 * @param fileInputStream stream positioned at the start of the DOC content; it is
 *                        closed by this method
 * @return the stored page count, or {@code 0} if the document could not be read
 * @throws IOException declared for backward compatibility with existing callers
 */
public static int countWord2003Page(InputStream fileInputStream) throws IOException {
    int pageCount = 0;
    // NOTE(review): this is a static, process-wide POI setting that appears to relax the
    // zip-bomb inflate-ratio guard; confirm it is intended for untrusted input.
    ZipSecureFile.setMinInflateRatio(-1.0d);
    // try-with-resources avoids the NullPointerException the old finally block raised
    // when the WordExtractor constructor failed and the variable was still null.
    try (InputStream in = fileInputStream;
         WordExtractor doc = new WordExtractor(in)) {
        pageCount = doc.getSummaryInformation().getPageCount();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return pageCount;
}
持续更新中……