packagecom.example.demo.read;importorg.apache.poi.POIXMLDocument;importorg.apache.poi.POIXMLTextExtractor;importorg.apache.poi.hslf.extractor.PowerPointExtractor;importorg.apache.poi.hssf.usermodel.HSSFCell;importorg.apache.pdfbox.pdmodel.PDDocument;importorg.apache.poi.hwpf.extractor.WordExtractor;importorg.apache.poi.openxml4j.opc.OPCPackage;importorg.apache.poi.hssf.usermodel.HSSFWorkbook;importorg.apache.poi.ss.usermodel.Cell;importorg.apache.poi.ss.usermodel.Row;importorg.apache.poi.ss.usermodel.Sheet;importorg.apache.poi.ss.usermodel.Workbook;importorg.apache.poi.xslf.usermodel.XMLSlideShow;importorg.apache.poi.xslf.usermodel.XSLFSlide;importorg.apache.poi.xssf.usermodel.XSSFWorkbook;importorg.apache.poi.ss.usermodel.Workbook;importjavax.servlet.ServletException;importjavax.servlet.annotation.WebServlet;importjavax.servlet.http.HttpServlet;importjavax.servlet.http.HttpServletRequest;importjavax.servlet.http.HttpServletResponse;importorg.apache.poi.xwpf.extractor.XWPFWordExtractor;importorg.apache.poi.xwpf.usermodel.XWPFDocument;importorg.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;importorg.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;importorg.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;importorg.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;importorg.openxmlformats.schemas.presentationml.x2006.main.CTShape;importorg.openxmlformats.schemas.presentationml.x2006.main.CTSlide;importorg.apache.pdfbox.text.PDFTextStripper;importorg.apache.pdfbox.pdmodel.PDDocument;importjava.io.BufferedInputStream;importjava.io.BufferedReader;importjava.io.File;importjava.io.FileInputStream;importjava.io.FileNotFoundException;importjava.io.FileReader;importjava.io.InputStream;importjava.io.InputStreamReader;importjava.text.NumberFormat;importjava.util.ArrayList;importjava.util.List;public classDocRead {/*** @Description: POI 读取 word
* @create: 2019-07-27 9:48
* @update logs
 * @throws Exception
 */
private static int maxx = 10000; // Cap used by readWord: extraction loops stop once this many characters are collected, and the final text is cut to this length.
// The method that follows determines a file's character encoding.
private staticString get_code(File sourceFile) {
String charset= "GBK";byte[] first3Bytes = new byte[3];try{boolean checked = false;
BufferedInputStream bis= new BufferedInputStream(newFileInputStream(sourceFile));
bis.mark(0);int read = bis.read(first3Bytes, 0, 3);if (read == -1) {
bis.close();return charset; //文件编码为 ANSI
} else if (first3Bytes[0] == (byte) 0xFF
&& first3Bytes[1] == (byte) 0xFE) {
charset= "UTF-16LE"; //文件编码为 Unicode
checked = true;
}else if (first3Bytes[0] == (byte) 0xFE
&& first3Bytes[1] == (byte) 0xFF) {
charset= "UTF-16BE"; //文件编码为 Unicode big endian
checked = true;
}else if (first3Bytes[0] == (byte) 0xEF
&& first3Bytes[1] == (byte) 0xBB
&& first3Bytes[2] == (byte) 0xBF) {
charset= "UTF-8"; //文件编码为 UTF-8
checked = true;
}
bis.reset();if (!checked) {int loc = 0;while ((read = bis.read()) != -1) {
loc++;if (read >= 0xF0)break;if (0x80 <= read && read <= 0xBF) //单独出现BF以下的,也算是GBK
break;if (0xC0 <= read && read <= 0xDF) {
read=bis.read();if (0x80 <= read && read <= 0xBF) //双字节 (0xC0 - 0xDF)//(0x80//- 0xBF),也可能在GB编码内
continue;else
break;
}else if (0xE0 <= read && read <= 0xEF) {//也有可能出错,但是几率较小
read =bis.read();if (0x80 <= read && read <= 0xBF) {
read=bis.read();if (0x80 <= read && read <= 0xBF) {
charset= "UTF-8";break;
}else
break;
}else
break;
}
}
}
bis.close();
}catch(Exception e) {
e.printStackTrace();
}returncharset;
}//@SuppressWarnings("resource")
public static String readWord(String filePath) throwsException{if(filePath.equals("")) return null;//List linList = new ArrayList();
String buffer = "";try{if (filePath.endsWith(".doc")) {
InputStream fis= new FileInputStream(newFile(filePath));
WordExtractor ex= newWordExtractor(fis);
buffer=ex.getText();
fis.close();
ex.close();
}else if (filePath.endsWith(".docx")) {
FileInputStream fis= newFileInputStream(filePath);
XWPFDocument xdoc= newXWPFDocument(fis);
XWPFWordExtractor ex= newXWPFWordExtractor(xdoc);
buffer=ex.getText();
ex.close();
fis.close();
xdoc.close();//return buffer;
}else if(filePath.endsWith(".pdf")) {
PDDocument ex;
InputStream fis= new FileInputStream(newFile(filePath));
ex=PDDocument.load(fis);
PDFTextStripper stripper= newPDFTextStripper();
buffer=stripper.getText(ex);
fis.close();
ex.close();
}else if(filePath.endsWith(".txt")) {
File file= newFile(filePath);
String code=get_code(file);
System.out.println("code: " +code);//code = "UTF-8";
InputStream is = newFileInputStream(file);
InputStreamReader isr= newInputStreamReader(is, code);
BufferedReader fis= newBufferedReader(isr);
String linetxt= null;//result用来存储文件内容
StringBuilder sb = newStringBuilder();//按使用readLine方法,一次读一行
while ((linetxt = fis.readLine()) != null && sb.length()
System.out.println(linetxt);
sb.append(linetxt);
sb.append(" ");
}
is.close();
isr.close();
fis.close();
buffer=sb.toString();//System.out.println("tex\n" + buffer);
}else if(filePath.endsWith("xls") || filePath.endsWith("xlsx")) {
StringBuilder sb= newStringBuilder();
FileInputStream fis= newFileInputStream(filePath);
Workbook wb= null; //Workbook 不能close 关闭fis即可
if(filePath.endsWith("xsl")) {
wb= newHSSFWorkbook(fis);
}else{
wb= newXSSFWorkbook(fis);
}for(int sheetIndex = 0; sheetIndex < wb.getNumberOfSheets() && sb.length() < maxx; sheetIndex++) {
Sheet sheet= wb.getSheetAt(sheetIndex); //读取sheet 0
int firstRowIndex = sheet.getFirstRowNum(); //设置变量的第一行
int lastRowIndex = sheet.getLastRowNum(); //设置变量的行
System.out.println("firstRowIndex: "+firstRowIndex);
System.out.println("lastRowIndex: "+lastRowIndex);for(int rIndex = firstRowIndex; rIndex <= lastRowIndex && sb.length() < maxx; rIndex++) { //遍历行
System.out.println("rIndex: " +rIndex);
Row row=sheet.getRow(rIndex);if (row != null) {int firstCellIndex =row.getFirstCellNum();int lastCellIndex =row.getLastCellNum();
System.out.println("1c: " + firstCellIndex + "lc: " +lastCellIndex);for (int cIndex = firstCellIndex; cIndex < lastCellIndex && sb.length() < maxx; cIndex++) { //遍历列
Cell cell =row.getCell(cIndex);
System.out.println(cell);if (cell != null) {
sb.append(cell.toString());
sb.append(" ");//System.out.println(cell.toString());
}
}
}
}
}
fis.close();
buffer=sb.toString();
}else if(filePath.endsWith("ppt")) {
FileInputStream fis= new FileInputStream(newFile(filePath));
PowerPointExtractor ex=newPowerPointExtractor(fis);
buffer=ex.getText();
fis.close();
ex.close();
}else if(filePath.endsWith("pptx")) {
StringBuilder sb= newStringBuilder();
FileInputStream fis= newFileInputStream(filePath);
XMLSlideShow xmlSlideShow= newXMLSlideShow(fis);
List slides =xmlSlideShow.getSlides();for(XSLFSlide slide:slides){
CTSlide rawSlide=slide.getXmlObject();
CTGroupShape gs=rawSlide.getCSld().getSpTree();
CTShape[] shapes=gs.getSpArray();for(CTShape shape:shapes){
CTTextBody tb=shape.getTxBody();if(null==tb){continue;
}
CTTextParagraph[] paras=tb.getPArray();for(CTTextParagraph textParagraph:paras){
CTRegularTextRun[] textRuns=textParagraph.getRArray();for(CTRegularTextRun textRun:textRuns){
sb.append(textRun.getT()+ " ");
}
}
}
}
buffer=sb.toString();
xmlSlideShow.close();
fis.close();
}else{return null;
}
buffer= buffer.replace("\n|\r", " ");//buffer = buffer.replace("'", " ");
if(buffer.length() > maxx) buffer = buffer.substring(0,maxx);returnbuffer;
}catch(Exception e) {
System.out.print("error---->"+filePath);
e.printStackTrace();return null;
}
}
}