java读取word txt pdf_Java 读取Word Excel PDF TXT 文本

最新推荐文章于 2024-05-06 09:01:24 发布

带笑子

最新推荐文章于 2024-05-06 09:01:24 发布

阅读量155

点赞数

文章标签： java读取word txt pdf

本文链接：https://blog.csdn.net/weixin_36463451/article/details/114233074

版权

packagecom.example.demo.read;importorg.apache.poi.POIXMLDocument;importorg.apache.poi.POIXMLTextExtractor;importorg.apache.poi.hslf.extractor.PowerPointExtractor;importorg.apache.poi.hssf.usermodel.HSSFCell;importorg.apache.pdfbox.pdmodel.PDDocument;importorg.apache.poi.hwpf.extractor.WordExtractor;importorg.apache.poi.openxml4j.opc.OPCPackage;importorg.apache.poi.hssf.usermodel.HSSFWorkbook;importorg.apache.poi.ss.usermodel.Cell;importorg.apache.poi.ss.usermodel.Row;importorg.apache.poi.ss.usermodel.Sheet;importorg.apache.poi.ss.usermodel.Workbook;importorg.apache.poi.xslf.usermodel.XMLSlideShow;importorg.apache.poi.xslf.usermodel.XSLFSlide;importorg.apache.poi.xssf.usermodel.XSSFWorkbook;importorg.apache.poi.ss.usermodel.Workbook;importjavax.servlet.ServletException;importjavax.servlet.annotation.WebServlet;importjavax.servlet.http.HttpServlet;importjavax.servlet.http.HttpServletRequest;importjavax.servlet.http.HttpServletResponse;importorg.apache.poi.xwpf.extractor.XWPFWordExtractor;importorg.apache.poi.xwpf.usermodel.XWPFDocument;importorg.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;importorg.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;importorg.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;importorg.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;importorg.openxmlformats.schemas.presentationml.x2006.main.CTShape;importorg.openxmlformats.schemas.presentationml.x2006.main.CTSlide;importorg.apache.pdfbox.text.PDFTextStripper;importorg.apache.pdfbox.pdmodel.PDDocument;importjava.io.BufferedInputStream;importjava.io.BufferedReader;importjava.io.File;importjava.io.FileInputStream;importjava.io.FileNotFoundException;importjava.io.FileReader;importjava.io.InputStream;importjava.io.InputStreamReader;importjava.text.NumberFormat;importjava.util.ArrayList;importjava.util.List;public classDocRead {/*** @Description: POI 读取 word

* @create: 2019-07-27 9:48

* @update logs

*@throwsException*/

// Upper bound on the number of characters of extracted text: the extraction loops stop
// appending once the accumulated text reaches this length, and the final result is
// truncated to it.
private static int maxx = 10000;
// (Translated original note) The method below determines a file's encoding format.

private staticString get_code(File sourceFile) {

String charset= "GBK";byte[] first3Bytes = new byte[3];try{boolean checked = false;

BufferedInputStream bis= new BufferedInputStream(newFileInputStream(sourceFile));

bis.mark(0);int read = bis.read(first3Bytes, 0, 3);if (read == -1) {

bis.close();return charset; //文件编码为 ANSI

} else if (first3Bytes[0] == (byte) 0xFF

&& first3Bytes[1] == (byte) 0xFE) {

charset= "UTF-16LE"; //文件编码为 Unicode

checked = true;

}else if (first3Bytes[0] == (byte) 0xFE

&& first3Bytes[1] == (byte) 0xFF) {

charset= "UTF-16BE"; //文件编码为 Unicode big endian

checked = true;

}else if (first3Bytes[0] == (byte) 0xEF

&& first3Bytes[1] == (byte) 0xBB

&& first3Bytes[2] == (byte) 0xBF) {

charset= "UTF-8"; //文件编码为 UTF-8

checked = true;

}

bis.reset();if (!checked) {int loc = 0;while ((read = bis.read()) != -1) {

loc++;if (read >= 0xF0)break;if (0x80 <= read && read <= 0xBF) //单独出现BF以下的，也算是GBK

break;if (0xC0 <= read && read <= 0xDF) {

read=bis.read();if (0x80 <= read && read <= 0xBF) //双字节 (0xC0 - 0xDF)//(0x80//- 0xBF),也可能在GB编码内

continue;else

break;

}else if (0xE0 <= read && read <= 0xEF) {//也有可能出错，但是几率较小

read =bis.read();if (0x80 <= read && read <= 0xBF) {

read=bis.read();if (0x80 <= read && read <= 0xBF) {

charset= "UTF-8";break;

}else

break;

}else

break;

}

bis.close();

}catch(Exception e) {

e.printStackTrace();

}returncharset;

}//@SuppressWarnings("resource")

public static String readWord(String filePath) throwsException{if(filePath.equals("")) return null;//List linList = new ArrayList();

String buffer = "";try{if (filePath.endsWith(".doc")) {

InputStream fis= new FileInputStream(newFile(filePath));

WordExtractor ex= newWordExtractor(fis);

buffer=ex.getText();

fis.close();

ex.close();

}else if (filePath.endsWith(".docx")) {

FileInputStream fis= newFileInputStream(filePath);

XWPFDocument xdoc= newXWPFDocument(fis);

XWPFWordExtractor ex= newXWPFWordExtractor(xdoc);

buffer=ex.getText();

ex.close();

fis.close();

xdoc.close();//return buffer;

}else if(filePath.endsWith(".pdf")) {

PDDocument ex;

InputStream fis= new FileInputStream(newFile(filePath));

ex=PDDocument.load(fis);

PDFTextStripper stripper= newPDFTextStripper();

buffer=stripper.getText(ex);

fis.close();

ex.close();

}else if(filePath.endsWith(".txt")) {

File file= newFile(filePath);

String code=get_code(file);

System.out.println("code: " +code);//code = "UTF-8";

InputStream is = newFileInputStream(file);

InputStreamReader isr= newInputStreamReader(is, code);

BufferedReader fis= newBufferedReader(isr);

String linetxt= null;//result用来存储文件内容

StringBuilder sb = newStringBuilder();//按使用readLine方法，一次读一行

while ((linetxt = fis.readLine()) != null && sb.length()

System.out.println(linetxt);

sb.append(linetxt);

sb.append(" ");

}

is.close();

isr.close();

fis.close();

buffer=sb.toString();//System.out.println("tex\n" + buffer);

}else if(filePath.endsWith("xls") || filePath.endsWith("xlsx")) {

StringBuilder sb= newStringBuilder();

FileInputStream fis= newFileInputStream(filePath);

Workbook wb= null; //Workbook 不能close 关闭fis即可

if(filePath.endsWith("xsl")) {

wb= newHSSFWorkbook(fis);

}else{

wb= newXSSFWorkbook(fis);

}for(int sheetIndex = 0; sheetIndex < wb.getNumberOfSheets() && sb.length() < maxx; sheetIndex++) {

Sheet sheet= wb.getSheetAt(sheetIndex); //读取sheet 0

int firstRowIndex = sheet.getFirstRowNum(); //设置变量的第一行

int lastRowIndex = sheet.getLastRowNum(); //设置变量的行

System.out.println("firstRowIndex: "+firstRowIndex);

System.out.println("lastRowIndex: "+lastRowIndex);for(int rIndex = firstRowIndex; rIndex <= lastRowIndex && sb.length() < maxx; rIndex++) { //遍历行

System.out.println("rIndex: " +rIndex);

Row row=sheet.getRow(rIndex);if (row != null) {int firstCellIndex =row.getFirstCellNum();int lastCellIndex =row.getLastCellNum();

System.out.println("1c: " + firstCellIndex + "lc: " +lastCellIndex);for (int cIndex = firstCellIndex; cIndex < lastCellIndex && sb.length() < maxx; cIndex++) { //遍历列

Cell cell =row.getCell(cIndex);

System.out.println(cell);if (cell != null) {

sb.append(cell.toString());

sb.append(" ");//System.out.println(cell.toString());

}

fis.close();

buffer=sb.toString();

}else if(filePath.endsWith("ppt")) {

FileInputStream fis= new FileInputStream(newFile(filePath));

PowerPointExtractor ex=newPowerPointExtractor(fis);

buffer=ex.getText();

fis.close();

ex.close();

}else if(filePath.endsWith("pptx")) {

StringBuilder sb= newStringBuilder();

FileInputStream fis= newFileInputStream(filePath);

XMLSlideShow xmlSlideShow= newXMLSlideShow(fis);

List slides =xmlSlideShow.getSlides();for(XSLFSlide slide:slides){

CTSlide rawSlide=slide.getXmlObject();

CTGroupShape gs=rawSlide.getCSld().getSpTree();

CTShape[] shapes=gs.getSpArray();for(CTShape shape:shapes){

CTTextBody tb=shape.getTxBody();if(null==tb){continue;

}

CTTextParagraph[] paras=tb.getPArray();for(CTTextParagraph textParagraph:paras){

CTRegularTextRun[] textRuns=textParagraph.getRArray();for(CTRegularTextRun textRun:textRuns){

sb.append(textRun.getT()+ " ");

}

buffer=sb.toString();

xmlSlideShow.close();

fis.close();

}else{return null;

}

buffer= buffer.replace("\n|\r", " ");//buffer = buffer.replace("'", " ");

if(buffer.length() > maxx) buffer = buffer.substring(0,maxx);returnbuffer;

}catch(Exception e) {

System.out.print("error---->"+filePath);

e.printStackTrace();return null;

}

带笑子

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java读取word txt pdf_Java 读取Word Excel PDF TXT 文本

packagecom.example.demo.read;importorg.apache.poi.POIXMLDocument;importorg.apache.poi.POIXMLTextExtractor;importorg.apache.poi.hslf.extractor.PowerPointExtractor;importorg.apache.poi.hssf.usermodel.HS...
复制链接

扫一扫

java读取word txt pdf_Java 读取Word Excel PDF TXT 文本

“相关推荐”对你有帮助么？