Java 判断文档类型,并按类型提取文档的文本内容(doc、docx、xls、xlsx、ppt、pptx、pdf、txt)

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Extracts plain text from an in-memory document, choosing the parser from the
 * file suffix (doc, docx, xls, xlsx, ppt, pptx, pdf, txt).
 *
 * <p>Extraction is best-effort: when a parser fails, the stack trace is printed
 * and that document contributes an empty string instead of an exception.
 */
public class ParseText {
    /** Runs of whitespace that {@link #parse(byte[], String)} strips from its result. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Extracts the text of a document and removes every whitespace character from it.
     *
     * @param buffer the raw bytes of the document; {@code null} yields an empty string
     * @param suffix the file extension, matched case-insensitively, with or without a
     *               leading dot (for example {@code "pdf"}, {@code "PDF"} or {@code ".pdf"})
     * @return the extracted text without whitespace, or an empty string when the type
     *         is not supported or the document could not be parsed
     */
    public static String parse(byte[] buffer, String suffix) {
        if (buffer == null) {
            return "";
        }

        // Normalise the suffix so "PDF" and ".pdf" select the same parser as "pdf".
        String type = suffix == null ? "" : suffix.trim().toLowerCase(Locale.ROOT);
        if (type.startsWith(".")) {
            type = type.substring(1);
        }

        String text = "";
        switch (type) {
            case "doc":
                text = getTextFromWord(buffer);
                break;
            case "docx":
                text = getTextFromWord2007(buffer);
                break;
            case "xls":
                text = getTextFromExcel(buffer);
                break;
            case "xlsx":
                text = getTextFromExcel2007(buffer);
                break;
            case "ppt":
                text = getTextFromPPT(buffer);
                break;
            case "pptx":
                text = getTextFromPPT2007(buffer);
                break;
            case "pdf":
                text = getTextFromPdf(buffer);
                break;
            case "txt":
                text = getTextFromTxt(buffer);
                break;
            default:
                System.out.println("不支持解析的文档类型");
        }

        return WHITESPACE.matcher(text).replaceAll("");
    }

    /** Reads the full text of a Word 97-2003 (.doc) document; embedded images are not part of the text. */
    private static String getTextFromWord(byte[] file) {
        String text = "";
        try (WordExtractor extractor = new WordExtractor(new ByteArrayInputStream(file))) {
            text = extractor.getText();
        } catch (IOException | RuntimeException e) {
            // Best-effort: report the failure and fall back to whatever was extracted.
            e.printStackTrace();
        }
        return text;
    }

    /** Reads the full text of a Word 2007+ (.docx) document. */
    private static String getTextFromWord2007(byte[] file) {
        String text = "";
        try (XWPFWordExtractor extractor =
                     new XWPFWordExtractor(new XWPFDocument(new ByteArrayInputStream(file)))) {
            text = extractor.getText();
        } catch (IOException | RuntimeException e) {
            // Unchecked parser errors are caught too, so a file with the wrong
            // format behaves like the .doc/.ppt paths instead of escaping parse().
            e.printStackTrace();
        }
        return text;
    }

    /** Reads the cell text of an Excel 97-2003 (.xls) workbook, using formula results and omitting sheet names. */
    private static String getTextFromExcel(byte[] file) {
        String text = "";
        try (ExcelExtractor extractor = new ExcelExtractor(
                new HSSFWorkbook(new POIFSFileSystem(new ByteArrayInputStream(file))))) {
            extractor.setFormulasNotResults(false);
            extractor.setIncludeSheetNames(false);
            text = extractor.getText();
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return text;
    }

    /** Reads the cell text of an Excel 2007+ (.xlsx) workbook, omitting sheet names. */
    private static String getTextFromExcel2007(byte[] file) {
        String text = "";
        try (XSSFExcelExtractor extractor =
                     new XSSFExcelExtractor(new XSSFWorkbook(new ByteArrayInputStream(file)))) {
            extractor.setIncludeSheetNames(false);
            text = extractor.getText();
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return text;
    }

    /** Reads the full text of a PowerPoint 97-2003 (.ppt) presentation. */
    private static String getTextFromPPT(byte[] file) {
        String text = "";
        try (PowerPointExtractor extractor = new PowerPointExtractor(new ByteArrayInputStream(file))) {
            text = extractor.getText();
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return text;
    }

    /** Reads the full text of a PowerPoint 2007+ (.pptx) presentation. */
    private static String getTextFromPPT2007(byte[] file) {
        String text = "";
        try (XSLFPowerPointExtractor extractor =
                     new XSLFPowerPointExtractor(new XMLSlideShow(new ByteArrayInputStream(file)))) {
            text = extractor.getText();
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return text;
    }

    /** Reads the full text of a PDF document. */
    private static String getTextFromPdf(byte[] file) {
        String text = "";
        try (PDDocument document = PDDocument.load(new ByteArrayInputStream(file))) {
            text = new PDFTextStripper().getText(document);
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return text;
    }

    /**
     * Returns the length in bytes of a leading byte-order mark:
     * 3 for UTF-8 (EF BB BF), 2 for UTF-16 (FF FE or FE FF), otherwise 0.
     */
    private static int bomLength(byte[] data) {
        if (data.length >= 2) {
            int first = data[0] & 0xFF;
            int second = data[1] & 0xFF;
            if ((first == 0xFF && second == 0xFE) || (first == 0xFE && second == 0xFF)) {
                return 2;
            }
            if (data.length >= 3 && first == 0xEF && second == 0xBB && (data[2] & 0xFF) == 0xBF) {
                return 3;
            }
        }
        return 0;
    }

    /** Tells whether the byte at {@code index} exists and lies in the range 0x80-0xBF. */
    private static boolean isContinuationByte(byte[] data, int index) {
        if (index >= data.length) {
            return false;
        }
        int value = data[index] & 0xFF;
        return 0x80 <= value && value <= 0xBF;
    }

    /**
     * Guesses the charset of a text file.
     *
     * <p>A byte-order mark decides the answer directly. Without one the bytes are
     * scanned: the first well-formed three-byte sequence (lead byte 0xE0-0xEF
     * followed by two bytes in 0x80-0xBF) selects UTF-8, and any other
     * non-ASCII pattern that ends the scan leaves the default, GBK. Two-byte
     * sequences are skipped because they may also occur in GBK text.
     *
     * @return {@code "UTF-16LE"}, {@code "UTF-16BE"}, {@code "UTF-8"} or {@code "GBK"}
     */
    private static String detectCharset(byte[] data) {
        int length = data.length;
        if (length >= 2) {
            int first = data[0] & 0xFF;
            int second = data[1] & 0xFF;
            if (first == 0xFF && second == 0xFE) {
                return "UTF-16LE";
            }
            if (first == 0xFE && second == 0xFF) {
                return "UTF-16BE";
            }
            if (length >= 3 && first == 0xEF && second == 0xBB && (data[2] & 0xFF) == 0xBF) {
                return "UTF-8";
            }
        }

        int index = 0;
        while (index < length) {
            int value = data[index++] & 0xFF;
            if (value < 0x80) {
                continue; // plain ASCII tells us nothing
            }
            if (0xC0 <= value && value <= 0xDF) {
                // Two-byte pattern: valid in UTF-8 but also possible in GBK, keep looking.
                if (isContinuationByte(data, index)) {
                    index++;
                    continue;
                }
                break;
            }
            if (0xE0 <= value && value <= 0xEF) {
                // Three-byte pattern: can still be wrong, but the chance is small.
                if (isContinuationByte(data, index) && isContinuationByte(data, index + 1)) {
                    return "UTF-8";
                }
                break;
            }
            // A stray byte in 0x80-0xBF or a byte >= 0xF0 is treated as GBK.
            break;
        }
        return "GBK";
    }

    /**
     * Decodes a text file with the detected charset. A leading byte-order mark is
     * skipped so the result does not start with U+FEFF.
     */
    private static String getTextFromTxt(byte[] file) {
        String text = "";
        try {
            String encoding = detectCharset(file);
            int offset = bomLength(file);
            text = new String(file, offset, file.length - offset, encoding);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return text;
    }
}




  • 10
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
好的,以下是一个可以获取Word、PPTPDF、Excel等文档页数的Java工具类,传入参数为MultipartFile类型的示例代码: ```java import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xwpf.usermodel.XWPFDocument; import java.io.IOException; import java.io.InputStream; public class DocumentPageCountUtil { /** * 获取Word文档页数 * * @param file MultipartFile类型的Word文档 * @return Word文档页数 * @throws IOException */ public static int getWordPageCount(MultipartFile file) throws IOException { InputStream inputStream = file.getInputStream(); HWPFDocument document = new HWPFDocument(inputStream); Range range = document.getRange(); int pageCount = range.numParagraphs() / 28 + 1; return pageCount; } /** * 获取PPT文档页数 * * @param file MultipartFile类型PPT文档 * @return PPT文档页数 * @throws IOException */ public static int getPptPageCount(MultipartFile file) throws IOException { InputStream inputStream = file.getInputStream(); XMLSlideShow ppt = new XMLSlideShow(inputStream); int pageCount = ppt.getSlides().size(); return pageCount; } /** * 获取PDF文档页数 * * @param file MultipartFile类型PDF文档 * @return PDF文档页数 * @throws IOException */ public static int getPdfPageCount(MultipartFile file) throws IOException { InputStream inputStream = file.getInputStream(); PDDocument document = PDDocument.load(inputStream); int pageCount = document.getNumberOfPages(); return pageCount; } /** * 获取Excel文档页数 * * @param file MultipartFile类型的Excel文档 * @return Excel文档页数 * @throws IOException */ public static int getExcelPageCount(MultipartFile file) throws IOException { InputStream inputStream = file.getInputStream(); Workbook workbook; if (file.getOriginalFilename().endsWith(".xls")) { workbook = new HSSFWorkbook(inputStream); } else { workbook = new XSSFWorkbook(inputStream); } int pageCount = workbook.getNumberOfSheets(); 
return pageCount; } /** * 获取文档页数 * * @param file MultipartFile类型文档 * @return 文档页数 * @throws IOException */ public static int getDocumentPageCount(MultipartFile file) throws IOException { String originalFilename = file.getOriginalFilename(); if (originalFilename.endsWith(".doc")) { return getWordPageCount(file); } else if (originalFilename.endsWith(".docx")) { return getWordPageCount(file); } else if (originalFilename.endsWith(".ppt")) { return getPptPageCount(file); } else if (originalFilename.endsWith(".pptx")) { return getPptPageCount(file); } else if (originalFilename.endsWith(".pdf")) { return getPdfPageCount(file); } else if (originalFilename.endsWith(".xls")) { return getExcelPageCount(file); } else if (originalFilename.endsWith(".xlsx")) { return getExcelPageCount(file); } else { throw new IOException("文件格式不支持"); } } } ``` 这个工具类中,我们使用了Apache POI和Apache PDFBox这两个开源库来处理文档。其中,getWordPageCount方法用于获取Word文档页数,getPptPageCount方法用于获取PPT文档页数,getPdfPageCount方法用于获取PDF文档页数,getExcelPageCount方法用于获取Excel文档页数,getDocumentPageCount方法用于根据传入的文件类型调用相应的方法获取文档页数。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

南大白

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值