Apache POI: A Java library for reading and writing Microsoft Office binary and OOXML file formats.
组件关系
对应关系说明:
POIFS for OLE 2 Documents
一般情况下,直接使用针对特定文档类型的高层API即可;少数情况下可能需要直接使用POIFS。更多信息:POIFS project page
HPSF for OLE 2 Document Properties
HPSF处理OLE 2 的属性(title, author, date of last modification etc.);
更多信息:HPSF project page
HSSF and XSSF for Excel Documents
HSSF对应Microsoft Excel 97 (-2003);
XSSF对应Microsoft Excel XML (2007+);
更多信息:HSSF+XSSF project page
HWPF and XWPF for Word Documents
HWPF对应Microsoft Word 97 (-2003),对老版本Word 6和Word 95格式提供简单的文本提取支持;
XWPF对应WordprocessingML (2007+);
更多信息:HWPF project page
HSLF and XSLF for PowerPoint Documents
HSLF对应Microsoft PowerPoint 97(-2003) ;
XSLF对应PresentationML (2007+);
更多信息:HSLF project page
HDGF and XDGF for Visio Documents
HDGF对应Microsoft Visio 97(-2003),目前只支持非常低级别的读取和简单的文本提取。更多信息:HDGF / Diagram project page
XDGF对应Microsoft Visio XML (.vsdx)。更多信息:XDGF / Diagram project page
HPBF for Publisher Documents
HPBF对应Microsoft Publisher 98(-2007),目前只支持大约一半文件部分的低级别读取,以及简单的文本提取。更多信息:[HPBF project page](https://poi.apache.org/components/hpbf/index.html)
HMEF for TNEF (winmail.dat) Outlook Attachments
HMEF对应Microsoft TNEF (Transport Neutral Encoding Format),TNEF有时被Outlook用于对邮件进行编码,通常作为winmail.dat发送。目前只支持低级别的阅读。更多信息:HMEF project page
HSMF for Outlook Messages
HSMF对应Microsoft Outlook message,目前只包含MSG文件的一些文本内容和一些附件。更多信息:HSMF project page
样例
读取普通文本:
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.0.0</version>
</dependency>
// Base directory against which the sample file names below are resolved:
// the value of the "user.home" system property.
Path dir = Paths.get(System.getProperty("user.home"));
/**
 * Reads {@code test.doc} from {@code dir} and prints the text of its main
 * range followed by the text of its header story range.
 *
 * @throws IOException if the file cannot be opened or parsed
 */
@Test
public void readDocText() throws IOException {
    // The stream is declared as its own resource: try-with-resources only
    // closes what it declares, so a stream created inline would stay open
    // if the HWPFDocument constructor threw.
    try (InputStream in = new FileInputStream(dir.resolve("test.doc").toFile());
         HWPFDocument doc = new HWPFDocument(in)) {
        StringBuilder text = new StringBuilder(64);
        // main doc
        Range main = doc.getRange();
        text.append(_extract(main));
        // header, footer
        Range hfr = doc.getHeaderStoryRange();
        text.append(_extract(hfr));
        System.out.println(text);
    }
}
/**
 * Concatenates the text of every paragraph in the given range, in index
 * order, appending "\n" after each paragraph.
 *
 * @param range the range whose paragraphs are read
 * @return the joined paragraph text
 */
String _extract(Range range) {
    final StringBuilder joined = new StringBuilder(64);
    final int total = range.numParagraphs();
    int index = 0;
    while (index < total) {
        final Paragraph paragraph = range.getParagraph(index);
        joined.append(paragraph.text());
        joined.append("\n");
        index++;
    }
    return joined.toString();
}
/**
 * Reads {@code test.docx} from {@code dir} and prints the text returned by
 * an {@code XWPFWordExtractor} built on the document.
 *
 * @throws IOException if the file cannot be opened or parsed
 */
@Test
public void readDocxText() throws IOException {
    // The stream is declared as its own resource: try-with-resources only
    // closes what it declares, so a stream created inline would stay open
    // if the XWPFDocument constructor threw.
    try (InputStream in = new FileInputStream(dir.resolve("test.docx").toFile());
         XWPFDocument docx = new XWPFDocument(in)) {
        XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(docx);
        System.out.println(xwpfWordExtractor.getText());
    }
}
/**
 * Reads {@code test.xls} from {@code dir} and prints, for every sheet: the
 * sheet name, the header and footer text, and then one line per row holding
 * the text of each cell followed by the cell's comment text (if any).
 *
 * @throws IOException if the file cannot be opened or parsed
 */
@Test
public void readExcelText() throws IOException {
    // The stream is declared as its own resource so that it is closed on
    // every path, including a failure inside the HSSFWorkbook constructor.
    try (InputStream in = new FileInputStream(dir.resolve("test.xls").toFile());
         HSSFWorkbook hssfWorkbook = new HSSFWorkbook(in)) {
        // Fully qualified so that this snippet needs no additional import.
        org.apache.poi.ss.usermodel.DataFormatter formatter =
                new org.apache.poi.ss.usermodel.DataFormatter();
        StringBuilder text = new StringBuilder(64);
        for (Sheet sh : hssfWorkbook) {
            HSSFSheet hssfSheet = (HSSFSheet) sh;
            // sheet name
            text.append(hssfSheet.getSheetName()).append("\n");
            // header, footer
            text.append(ExcelExtractor._extractHeaderFooter(hssfSheet.getHeader()));
            text.append(ExcelExtractor._extractHeaderFooter(hssfSheet.getFooter()));
            // rows and cells
            for (Row row : hssfSheet) {
                for (Cell cell : row) {
                    HSSFCell hssfCell = (HSSFCell) cell;
                    text.append(_cellText(hssfCell, formatter));
                    // comment
                    HSSFComment hssfComment = hssfCell.getCellComment();
                    if (hssfComment != null) {
                        text.append(hssfComment.getString().getString());
                    }
                }
                text.append("\n");
            }
        }
        System.out.println(text);
    }
}

/**
 * Returns the display text of a cell of any type.
 *
 * <p>Calling {@code getRichStringCellValue()} directly is only valid for
 * string and blank cells (and formulas with a cached string result); for
 * numeric, boolean or error cells it throws {@code IllegalStateException}.
 * Non-formula cells are therefore rendered through the formatter, and
 * formula cells are rendered from their cached result.
 *
 * @param cell      the cell to render
 * @param formatter the formatter used for non-formula and numeric values
 * @return the cell text; empty for blank cells
 */
String _cellText(HSSFCell cell, org.apache.poi.ss.usermodel.DataFormatter formatter) {
    switch (cell.getCellType()) {
        case FORMULA:
            switch (cell.getCachedFormulaResultType()) {
                case NUMERIC:
                    return formatter.formatRawCellContents(
                            cell.getNumericCellValue(),
                            cell.getCellStyle().getDataFormat(),
                            cell.getCellStyle().getDataFormatString());
                case BOOLEAN:
                    return String.valueOf(cell.getBooleanCellValue());
                case ERROR:
                    return org.apache.poi.ss.usermodel.FormulaError
                            .forInt(cell.getErrorCellValue()).getString();
                default:
                    // Cached string result: same value the original code appended.
                    return cell.getRichStringCellValue().getString();
            }
        default:
            return formatter.formatCellValue(cell);
    }
}
/**
 * Reads {@code test.xlsx} from {@code dir} and prints the text returned by
 * an {@code XSSFExcelExtractor} built on the workbook.
 *
 * @throws IOException if the file cannot be opened or parsed
 */
@Test
public void readXlsxText() throws IOException {
    // The stream is declared as its own resource: try-with-resources only
    // closes what it declares, so a stream created inline would stay open
    // if the XSSFWorkbook constructor threw.
    try (InputStream in = new FileInputStream(dir.resolve("test.xlsx").toFile());
         XSSFWorkbook xssfWorkbook = new XSSFWorkbook(in)) {
        XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfWorkbook);
        System.out.println(xssfExcelExtractor.getText());
    }
}
MS文档中插入文档
XWPFDocument:
/**
 * Opens {@code test/testembed.docx} under {@code dir}, walks its embedded
 * parts and prints the extracted text of each part whose name ends in
 * {@code .docx}, {@code .doc}, {@code .xls} or {@code .xlsx}. Parts with any
 * other name are skipped.
 *
 * @throws IOException        if the file or an embedded part cannot be read
 * @throws OpenXML4JException if the embedded parts cannot be listed
 */
@Test
void readEmbedDocx() throws IOException, OpenXML4JException {
    try (InputStream source = new FileInputStream(dir.resolve("test/testembed.docx").toFile());
         XWPFDocument container = new XWPFDocument(source)) {
        for (PackagePart part : container.getAllEmbeddedParts()) {
            final String partName = part.getPartName().getName();
            final String extracted;
            if (partName.endsWith(".docx")) {
                extracted = ExtractFileUtils.readDocxText(part.getInputStream());
            } else if (partName.endsWith(".doc")) {
                extracted = ExtractFileUtils.readDocText(part.getInputStream());
            } else if (partName.endsWith(".xls")) {
                extracted = ExtractFileUtils.readXlsText(part.getInputStream());
            } else if (partName.endsWith(".xlsx")) {
                extracted = ExtractFileUtils.readXlsxText(part.getInputStream());
            } else {
                // Not a recognised document type: nothing to print for this part.
                continue;
            }
            System.out.println(extracted);
        }
    }
}
ExtractFileUtils如下:
public class ExtractFileUtils {
public static String readDocText(InputStream inputStream) throws IOException {
try (HWPFDocument doc = new HWPFDocument(inputStream)) {
StringBuilder text = new StringBuilder(64);
//main doc
Range main = doc.getRange();
text.append(_extract(main));
//header,footer
Range hfr = doc.getHeaderStoryRange();
text.append(_extract(hfr));
return text.toString();
}
}
static String _extract(Range range) {
StringBuilder builder = new StringBuilder(64);
int lenParagraph = range.numParagraphs();
for (int x = 0; x < lenParagraph; x++) {
Paragraph p = range.getParagraph(x);
builder.append(p.text()).append("\n");
}
return builder.toString();
}
public static String readDocxText(InputStream inputStream) throws IOException {
try (XWPFDocument docx = new XWPFDocument(inputStream)) {
XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(docx);
StringBuilder text = new StringBuilder(64);
text.append(xwpfWordExtractor.getText());
try {
List<PackagePart> list = docx.getAllEmbeddedParts();
for (PackagePart packagePart : list) {
String embedName = packagePart.getPartName().getName();
if (embedName.endsWith(".docx")) {
text.append(readDocxText(packagePart.getInputStream()));
} else if (embedName.endsWith(".doc")) {
text.append(readDocText(packagePart.getInputStream()));
} else if (embedName.endsWith(".xls")) {
text.append(readXlsText(packagePart.getInputStream()));
} else if (embedName.endsWith(".xlsx")) {
text.append(readXlsxText(packagePart.getInputStream()));
}
}
} catch (OpenXML4JException e) {
throw new IOException(e);
}
return text.toString();
}
}
public static String readXlsText(InputStream inputStream) throws IOException {
try (HSSFWorkbook hssfWorkbook = new HSSFWorkbook(inputStream)) {
StringBuilder text = new StringBuilder(64);
for (Sheet sh : hssfWorkbook) {
HSSFSheet hssfSheet = (HSSFSheet) sh;
//sheet name
text.append(hssfSheet.getSheetName()).append("\n");
// header,footer
text.append(ExcelExtractor._extractHeaderFooter(hssfSheet.getHeader()));
text.append(ExcelExtractor._extractHeaderFooter(hssfSheet.getFooter()));
// Rows and cells
for (Row row : hssfSheet) {
HSSFRow hssfRow = (HSSFRow) row;
for (Iterator<Cell> ri = hssfRow.cellIterator(); ri.hasNext(); ) {
HSSFCell hssfCell = (HSSFCell) ri.next();
text.append(hssfCell.getRichStringCellValue());
//comment
HSSFComment hssfComment = hssfCell.getCellComment();
if (hssfComment != null) {
text.append(hssfComment.getString().getString());
}
}
text.append("\n");
}
}
return text.toString();
}
}
public static String readXlsxText(InputStream inputStream) throws IOException {
try (XSSFWorkbook xssfWorkbook = new XSSFWorkbook(inputStream)) {
XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfWorkbook);
return xssfExcelExtractor.getText();
}
}
}