只能抽取文本。
1.导入依赖
<!-- Apache POI: read Word, PowerPoint and Excel files. -->
<!-- NOTE(review): the Java sample treats XMLSlideShow.getSlides() as an array (it wraps it in Arrays.asList), -->
<!-- so it presumably relies on this POI version's API; confirm before bumping the versions. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.xmlbeans</groupId>
<artifactId>xmlbeans</artifactId>
<version>2.6.0</version>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
<!-- Apache PDFBox: read PDF files. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.2</version>
</dependency>
2.具体代码
/**
 * Demo that extracts the plain text of every supported document found in a folder.
 *
 * <p>Supported extensions (matched case-insensitively): {@code .doc}, {@code .docx},
 * {@code .rtf}, {@code .ppt}, {@code .pptx}, {@code .xls}, {@code .xlsx} and {@code .pdf}.
 * Only text is extracted; the code never reads images or other embedded objects.
 */
public class TestDemo {

    /** Folder scanned when no folder is passed on the command line. */
    private static final String DEFAULT_FOLDER = "E://merge//";

    /**
     * Scans a folder and prints the text extracted from each supported file.
     *
     * @param args optional; {@code args[0]} is the folder to scan, otherwise
     *             {@link #DEFAULT_FOLDER} is used
     * @throws IOException          if a file cannot be read
     * @throws OpenXML4JException   if a {@code .docx} package cannot be opened
     * @throws XmlException         if a {@code .docx} document cannot be parsed
     * @throws BadLocationException if the parsed RTF document cannot be read back
     */
    public static void main(String[] args) throws IOException, OpenXML4JException, XmlException, BadLocationException {
        String folderPath = (args != null && args.length > 0) ? args[0] : DEFAULT_FOLDER;
        File[] files = new File(folderPath).listFiles();
        // listFiles() yields null when the path is not a readable directory.
        if (files == null || files.length == 0) {
            System.out.println("文件夹是空的");
            return;
        }
        for (File candidate : files) {
            // A sub-directory whose name happens to end in ".doc" etc. must not be opened as a file.
            if (!candidate.isFile()) {
                continue;
            }
            String text = extractText(candidate);
            if (text != null) {
                System.out.println(candidate.getName() + ":");
                System.out.println(text);
            }
        }
    }

    /**
     * Picks the extractor matching the file's extension and returns the extracted text.
     * Every stream opened here is closed even when the extractor throws.
     *
     * @param file the document to read
     * @return the extracted text, or {@code null} when the extension is not supported
     */
    private static String extractText(File file) throws IOException, OpenXML4JException, XmlException, BadLocationException {
        // Locale.ROOT keeps the lower-casing independent of the default locale (e.g. Turkish "I").
        String name = file.getName().toLowerCase(java.util.Locale.ROOT);
        int dot = name.lastIndexOf('.');
        String extension = dot < 0 ? "" : name.substring(dot);
        switch (extension) {
            case ".doc":
                try (InputStream in = new FileInputStream(file)) {
                    return readDoc1(in);
                }
            case ".docx":
                return readDoc2(file.getPath());
            case ".rtf":
                try (InputStream in = new FileInputStream(file)) {
                    return readDoc3(in);
                }
            case ".ppt":
                try (InputStream in = new FileInputStream(file)) {
                    return readPpt1(in);
                }
            case ".pptx":
                try (InputStream in = new FileInputStream(file)) {
                    return readPpt2(in);
                }
            case ".xls":
                try (InputStream in = new FileInputStream(file)) {
                    return readExcel1(in);
                }
            case ".xlsx":
                try (InputStream in = new FileInputStream(file)) {
                    return readExcel2(in);
                }
            case ".pdf":
                return readPdf(file);
            default:
                return null;
        }
    }

    /**
     * Extracts the text of a binary Word document ({@code .doc}).
     *
     * @param is stream positioned at the start of the document; the caller closes it
     * @return the document text
     * @throws IOException if the stream cannot be read
     */
    public static String readDoc1(InputStream is) throws IOException {
        return new WordExtractor(is).getText();
    }

    /**
     * Extracts the text of an OOXML Word document ({@code .docx}).
     *
     * @param path file-system path of the document
     * @return the document text
     * @throws OpenXML4JException if the package cannot be opened
     * @throws XmlException       if the document XML cannot be parsed
     * @throws IOException        if the file cannot be read
     */
    public static String readDoc2(String path) throws OpenXML4JException, XmlException, IOException {
        OPCPackage opcPackage = POIXMLDocument.openPackage(path);
        try {
            return new XWPFWordExtractor(opcPackage).getText();
        } finally {
            // revert() releases the package without writing anything back to the source file.
            opcPackage.revert();
        }
    }

    /**
     * Extracts the text of an RTF document ({@code .rtf}).
     *
     * @param is stream positioned at the start of the document; the caller closes it
     * @return the document text
     * @throws IOException          if the stream cannot be read
     * @throws BadLocationException if the parsed document cannot be read back
     */
    public static String readDoc3(InputStream is) throws IOException, BadLocationException {
        DefaultStyledDocument document = new DefaultStyledDocument();
        new RTFEditorKit().read(is, document, 0);
        String raw = document.getText(0, document.getLength());
        // The parsed text is re-encoded to ISO-8859-1 bytes and decoded as GBK.
        // NOTE(review): presumably a workaround for GBK-encoded RTF being read one byte per
        // char; characters outside ISO-8859-1 would not survive this round trip - confirm.
        return new String(raw.getBytes("ISO8859-1"), "GBK");
    }

    /**
     * Extracts the text of a binary PowerPoint presentation ({@code .ppt}).
     *
     * @param is stream positioned at the start of the presentation; the caller closes it
     * @return the presentation text
     * @throws IOException if the stream cannot be read
     */
    public static String readPpt1(InputStream is) throws IOException {
        return new PowerPointExtractor(is).getText();
    }

    /**
     * Extracts the text of an OOXML PowerPoint presentation ({@code .pptx}).
     *
     * <p>Only the regular text runs of shapes sitting directly in each slide's shape tree are
     * collected, and they are concatenated without any separator.
     *
     * @param is stream positioned at the start of the presentation; the caller closes it
     * @return the concatenated text runs
     * @throws IOException if the stream cannot be read
     */
    public static String readPpt2(InputStream is) throws IOException {
        StringBuilder text = new StringBuilder();
        XMLSlideShow slideShow = new XMLSlideShow(is);
        for (XSLFSlide slide : slideShow.getSlides()) {
            CTSlide rawSlide = slide.getXmlObject();
            CTGroupShape shapeTree = rawSlide.getCSld().getSpTree();
            for (CTShape shape : shapeTree.getSpArray()) {
                CTTextBody body = shape.getTxBody();
                // Shapes without a text body carry no text.
                if (body == null) {
                    continue;
                }
                for (CTTextParagraph paragraph : body.getPArray()) {
                    for (CTRegularTextRun run : paragraph.getRArray()) {
                        text.append(run.getT());
                    }
                }
            }
        }
        return text.toString();
    }

    /**
     * Extracts the text of a binary Excel workbook ({@code .xls}), including sheet names and
     * using formula results rather than the formulas themselves.
     *
     * @param is stream positioned at the start of the workbook; the caller closes it
     * @return the workbook text
     * @throws IOException if the stream cannot be read
     */
    public static String readExcel1(InputStream is) throws IOException {
        HSSFWorkbook workbook = new HSSFWorkbook(new POIFSFileSystem(is));
        ExcelExtractor extractor = new ExcelExtractor(workbook);
        extractor.setFormulasNotResults(false);
        extractor.setIncludeSheetNames(true);
        return extractor.getText();
    }

    /**
     * Extracts the text of an OOXML Excel workbook ({@code .xlsx}).
     *
     * <p>Every sheet is visited in workbook order. Each non-null cell contributes its
     * {@code toString()} value followed by a tab; rows and sheets are not otherwise separated.
     *
     * @param is stream positioned at the start of the workbook; the caller closes it
     * @return the tab-separated cell values
     * @throws IOException if the stream cannot be read
     */
    public static String readExcel2(InputStream is) throws IOException {
        StringBuilder text = new StringBuilder();
        XSSFWorkbook workbook = new XSSFWorkbook(is);
        int sheetCount = workbook.getNumberOfSheets();
        for (int sheetIndex = 0; sheetIndex < sheetCount; sheetIndex++) {
            Sheet sheet = workbook.getSheetAt(sheetIndex);
            int lastRowIndex = sheet.getLastRowNum();
            for (int rowIndex = sheet.getFirstRowNum(); rowIndex <= lastRowIndex; rowIndex++) {
                Row row = sheet.getRow(rowIndex);
                // Rows that were never written are returned as null.
                if (row == null) {
                    continue;
                }
                int lastCellIndex = row.getLastCellNum();
                for (int cellIndex = row.getFirstCellNum(); cellIndex < lastCellIndex; cellIndex++) {
                    Cell cell = row.getCell(cellIndex);
                    if (cell != null) {
                        text.append(cell.toString()).append('\t');
                    }
                }
            }
        }
        return text.toString();
    }

    /**
     * Extracts the text of a PDF document.
     *
     * @param file the PDF file
     * @return the document text
     * @throws IOException if the file cannot be read or parsed
     */
    public static String readPdf(File file) throws IOException {
        // The document holds the file open until it is closed, so always release it.
        try (PDDocument document = PDDocument.load(file)) {
            return new PDFTextStripper().getText(document);
        }
    }
}
以上代码只能抽取文件中的文本内容,图片等不会加载。下图是抽取ppt和excel的结果,可以看到只抽取到了文本;具体的实现要结合实际需求。示例中为了方便查看,我对异常只做了消极处理(直接向上抛出),在实际开发中还是要积极处理(捕获并妥善处理)。
另外:我是新手,有任何问题都可以提出,谢谢。