读取解析文件内容

1、所需依赖jar包

<dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.itextpdf/itext-core -->
        <dependency>
            <groupId>com.itextpdf</groupId>
            <artifactId>itext-core</artifactId>
            <version>8.0.4</version>
            <type>pom</type>
        </dependency>

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>4.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.1.2</version>
        </dependency>

2、定义解析文件类型枚举类

/**
 * File types handled by the content parser.
 *
 * <p>Each constant pairs a file extension (without the leading dot) with a
 * numeric key.
 */
public enum FileType {

    DOCX("docx", 1),
    DOC("doc", 2),
    PDF("pdf", 3),
    TXT("txt", 4),
    XLS("xls", 5),
    XLSX("xlsx", 6),
    PPT("ppt", 7),
    PPTX("pptx", 8),
    ;

    /** File extension for this type, lower case, without the dot. */
    private final String desc;

    /** Numeric key paired with this type. */
    private final int key;

    FileType(String desc, int key) {
        this.desc = desc;
        this.key = key;
    }

    /**
     * Returns the file extension for this type.
     *
     * @return the extension, e.g. {@code "docx"}
     */
    public String getDesc() {
        return desc;
    }

    /**
     * Returns the numeric key for this type. The field was previously stored
     * but could not be read from outside the enum.
     *
     * @return the numeric key, e.g. {@code 1} for {@link #DOCX}
     */
    public int getKey() {
        return key;
    }
}

3、解析文件

/**
     * Reads the sample file {@code D:\test.txt} and returns the text extracted
     * from it by {@code asyncExecutorService.getFileInfoContent}.
     *
     * @return the extracted text
     * @throws IOException if the file cannot be opened or read
     */
    public String getFileContent() throws IOException {
        File file = new File("D:\\test.txt");
        String ext = "txt";
        // try-with-resources releases the file handle even when extraction throws;
        // closing a FileInputStream a second time is harmless if the callee
        // has already closed it.
        try (InputStream inputStream = new FileInputStream(file)) {
            return asyncExecutorService.getFileInfoContent(inputStream, ext);
        }
    }
/**
     * Extracts the text content of a (small) file supplied as a stream.
     *
     * <p>The parser is chosen by comparing {@code ext} with the extensions
     * declared in {@link FileType}; the comparison ignores case.
     *
     * @param inputStream stream positioned at the start of the file; it is
     *                    always closed before this method returns, including
     *                    when extraction fails
     * @param ext         file extension without the leading dot, e.g. {@code "docx"}
     * @return the extracted text, or an empty string when {@code ext} is
     *         {@code null} or not one of the supported extensions
     * @throws IOException if the stream cannot be read or parsed
     */
    @Override
    public String getFileInfoContent(InputStream inputStream, String ext) throws IOException {
        // Binding the caller's stream to try-with-resources guarantees it is
        // closed on every path, not only on success.
        try (InputStream in = inputStream) {
            if (FileType.DOCX.getDesc().equalsIgnoreCase(ext)) {
                return handlerDocx(in);
            }
            if (FileType.DOC.getDesc().equalsIgnoreCase(ext)) {
                try (HWPFDocument document = new HWPFDocument(in)) {
                    return document.getText().toString();
                }
            }
            if (FileType.TXT.getDesc().equalsIgnoreCase(ext)) {
                // NOTE(review): decodes with the platform default charset, as before;
                // confirm the expected encoding of incoming text files.
                BufferedReader reader = new BufferedReader(new InputStreamReader(in));
                StringBuilder text = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    // Each line is stripped of surrounding whitespace.
                    text.append(line.trim()).append('\n');
                }
                return text.toString();
            }
            if (FileType.XLS.getDesc().equalsIgnoreCase(ext)) {
                try (Workbook workbook = new HSSFWorkbook(in)) {
                    return handlerExcel(workbook);
                }
            }
            if (FileType.XLSX.getDesc().equalsIgnoreCase(ext)) {
                try (Workbook workbook = new XSSFWorkbook(in)) {
                    return handlerExcel(workbook);
                }
            }
            if (FileType.PDF.getDesc().equalsIgnoreCase(ext)) {
                try (PdfReader pdfReader = new PdfReader(in);
                     PdfDocument pdfDocument = new PdfDocument(pdfReader)) {
                    // NOTE(review): only the first page is extracted, as before;
                    // confirm whether later pages should be included.
                    return PdfTextExtractor.getTextFromPage(pdfDocument.getFirstPage());
                }
            }
            if (FileType.PPT.getDesc().equalsIgnoreCase(ext)) {
                StringBuilder content = new StringBuilder();
                HSLFSlideShow slideShow = new HSLFSlideShow(in);
                // The extractor is now closed even when reading a slide throws.
                // NOTE(review): presumably closing the extractor also releases the
                // slide show it wraps, so the show is not closed separately - confirm.
                try (SlideShowExtractor extractor = new SlideShowExtractor(slideShow)) {
                    for (HSLFSlide slide : slideShow.getSlides()) {
                        content.append(extractor.getText(slide));
                    }
                }
                return content.toString();
            }
            if (FileType.PPTX.getDesc().equalsIgnoreCase(ext)) {
                return handlerPPTX(in);
            }
            // Null or unsupported extension: nothing to extract.
            return "";
        }
    }

    /**
     * Extracts the text of a .docx document.
     *
     * <p>Body elements are visited in document order. Paragraphs and tables
     * each contribute their text followed by a newline; other element kinds
     * are skipped.
     *
     * @param inputStream stream containing the .docx file
     * @return the concatenated text of all paragraphs and tables
     * @throws IOException if the document cannot be read
     */
    private String handlerDocx(InputStream inputStream) throws IOException {
        StringBuilder result = new StringBuilder();
        // try-with-resources: the document was previously never closed.
        try (XWPFDocument doc = new XWPFDocument(inputStream)) {
            for (IBodyElement element : doc.getBodyElements()) {
                if (element instanceof XWPFParagraph) {
                    result.append(getText((XWPFParagraph) element)).append('\n');
                } else if (element instanceof XWPFTable) {
                    result.append(((XWPFTable) element).getText()).append('\n');
                }
            }
        }
        return result.toString();
    }

    /**
     * Joins the text of every run in a paragraph.
     *
     * <p>Whitespace is trimmed once, from the ends of the assembled paragraph.
     * Trimming each run separately dropped the spaces at run boundaries, so
     * adjacent words in different runs were fused together.
     *
     * @param paragraph the paragraph whose runs are read
     * @return the paragraph text without leading or trailing whitespace
     */
    private static String getText(XWPFParagraph paragraph) {
        StringBuilder text = new StringBuilder();
        for (XWPFRun run : paragraph.getRuns()) {
            text.append(run.text());
        }
        return text.toString().trim();
    }

    /**
     * Extracts the contents of the first sheet of a workbook as tab-separated text.
     *
     * <p>Every cell yields its text followed by a tab, and every row ends with
     * a newline. String and boolean cells use their value, date-formatted
     * numeric cells are rendered as {@code yyyy-MM-dd}, and other numeric
     * cells are rendered through {@link DataFormatter}. Cells of any other
     * type yield an empty column.
     *
     * @param workbook the workbook to read; only the sheet at index 0 is processed
     * @return the sheet contents, or an empty string when the workbook has no sheets
     */
    private String handlerExcel(Workbook workbook) {
        StringBuilder result = new StringBuilder();
        // getSheetAt(0) throws on a workbook without sheets; treat that as "no content".
        if (workbook.getNumberOfSheets() == 0) {
            return result.toString();
        }

        // Created once per call rather than once per cell.
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
        DataFormatter dataFormatter = new DataFormatter();

        Sheet sheet = workbook.getSheetAt(0);
        for (Row row : sheet) {
            for (Cell cell : row) {
                String text;
                switch (cell.getCellType()) {
                    case STRING:
                        text = cell.getStringCellValue();
                        break;
                    case NUMERIC:
                        if (DateUtil.isCellDateFormatted(cell)) {
                            Date date = cell.getDateCellValue();
                            text = dateFormat.format(date);
                        } else {
                            text = dataFormatter.formatCellValue(cell);
                        }
                        break;
                    case BOOLEAN:
                        text = String.valueOf(cell.getBooleanCellValue());
                        break;
                    default:
                        // Previously the tab was printed to stdout instead of being
                        // appended, which shifted the columns that followed.
                        text = "";
                }
                result.append(text).append('\t');
            }
            result.append('\n');
        }
        return result.toString();
    }

    /**
     * Extracts the text of a .pptx presentation.
     *
     * <p>For every slide, each shape directly under the slide's shape tree
     * that has a text body contributes the text of its regular text runs, one
     * run per line.
     *
     * <p>Extraction is best-effort: if parsing fails, the stack trace is
     * printed and the text collected so far is returned.
     *
     * @param inputStream stream containing the .pptx file
     * @return the collected text, possibly partial or empty on failure
     */
    private String handlerPPTX(InputStream inputStream) {
        StringBuilder content = new StringBuilder();
        // try-with-resources: the slide show was previously left open when
        // parsing threw before reaching close().
        try (XMLSlideShow slideShow = new XMLSlideShow(inputStream)) {
            for (XSLFSlide slide : slideShow.getSlides()) {
                CTGroupShape shapeTree = slide.getXmlObject().getCSld().getSpTree();
                for (CTShape shape : shapeTree.getSpList()) {
                    CTTextBody textBody = shape.getTxBody();
                    if (textBody == null) {
                        // Shape carries no text.
                        continue;
                    }
                    for (CTTextParagraph paragraph : textBody.getPList()) {
                        for (CTRegularTextRun run : paragraph.getRList()) {
                            content.append(run.getT()).append('\n');
                        }
                    }
                }
            }
        } catch (Exception e) {
            // Deliberately best-effort: keep whatever text was gathered.
            e.printStackTrace();
        }
        return content.toString();
    }

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值