poi文件操作

最新推荐文章于 2024-06-26 19:21:55 发布

✿ﾟ卡笨卡

最新推荐文章于 2024-06-26 19:21:55 发布

阅读量71

点赞数

分类专栏： java 文章标签： java

本文链接：https://blog.csdn.net/tian__c/article/details/130555377

版权

java 专栏收录该内容

18 篇文章 1 订阅

订阅专栏

该代码示例展示了如何使用 Java 中的 Apache POI 库处理不同类型的 Microsoft Office 文件（如 .doc 和 .docx），以及使用 PDFBox 处理 PDF 文件，进行字数统计。此外，还涉及了将不同格式文件转换为文本以计算字数的方法。

摘要由CSDN通过智能技术生成

<!-- Apache POI 4.1.2 modules used by the DocUtil example below. DocUtil also imports
     Spire.Doc / Spire.PDF, PDFBox, commons-lang, Apache FOP and Spring Web (MultipartFile);
     those dependencies are NOT part of this snippet and have to be declared separately. -->
<dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>4.1.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-excelant</artifactId>
            <version>4.1.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.1.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml-schemas</artifactId>
            <version>4.1.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.1.2</version>
            <scope>compile</scope>
        </dependency>

`获取不同类型文件的字数`


import com.spire.doc.Document;
import com.spire.doc.FileFormat;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import org.apache.commons.lang.StringUtils;
import org.apache.fop.svg.PDFTextElementBridge;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hslf.usermodel.*;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xslf.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.springframework.web.multipart.MultipartFile;

import java.io.*;
import java.util.*;
import java.util.Objects;

public class DocUtil {


    /**
     * Counts the characters of an uploaded document, dispatching on its extension.
     *
     * <p>{@code .doc} and {@code .docx} uploads are converted to a text file next to
     * {@code remoteFile} via {@code docToTxt} and that text file is counted with
     * {@code txtCount}. {@code .pdf} and {@code .txt} are counted directly from
     * {@code remoteFile}. Any other extension yields {@code 0}.
     *
     * <p>Failures are printed to stderr and reported as {@code 0} (best effort).
     *
     * @param multipartFile uploaded file; its stream is only consumed for the Word formats
     * @param remoteFile    stored copy of the upload; read directly for pdf/txt and used as the
     *                      base path of the converted text file for doc/docx
     * @param fileFormat    extension including the leading dot, e.g. {@code ".docx"}; may be null
     * @return the character count, or 0 for an unsupported format or on failure
     */
    public static Integer docCount(MultipartFile multipartFile, File remoteFile, String fileFormat){
        Integer num = 0;
        // try-with-resources guarantees the upload stream is released on every path;
        // previously it was never closed.
        try (InputStream uploadStream = FileUtil.fileToStream(multipartFile)) {
            boolean isDoc = Objects.equals(fileFormat, ".doc");
            boolean isDocx = Objects.equals(fileFormat, ".docx");
            if (isDoc || isDocx) {
                FileFormat wordFormat = isDoc ? FileFormat.Doc : FileFormat.Docx;
                String txtPath = docToTxt(uploadStream, wordFormat, remoteFile.getPath());
                num = txtCount(new File(txtPath));
            } else if (Objects.equals(fileFormat, ".pdf")) {
                num = pdfCount(remoteFile);
            } else if (Objects.equals(fileFormat, ".txt")) {
                num = txtCount(remoteFile);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return num;
    }

    /**
     * Reads the character count stored in a Word document's built-in properties.
     *
     * <p>The value comes from {@code getBuiltinDocumentProperties().getCharCount()}; according
     * to the original author's note it excludes spaces.
     *
     * @param filePath   path of the document to load
     * @param fileFormat Spire format the file should be parsed as
     * @return the character count reported by the document properties
     */
    public static Integer wordCount(String filePath, FileFormat fileFormat){
        Document document = new Document();
        try {
            document.loadFromFile(filePath, fileFormat);
            return document.getBuiltinDocumentProperties().getCharCount();
        } finally {
            // Release the document even when loading fails (it was never closed before).
            document.close();
        }
    }

    /**
     * Counts the characters of a PDF by extracting the text of every page with Spire.PDF.
     *
     * <p>The result is the length of the concatenated page texts, so whitespace and line
     * breaks emitted by the extractor are included.
     *
     * @param file PDF file to read
     * @return total length of the extracted text
     */
    public static Integer pdfCount(File file){
        PdfDocument pdfDocument = new PdfDocument();
        try {
            pdfDocument.loadFromFile(file.getPath());
            StringBuilder extracted = new StringBuilder();
            int pageCount = pdfDocument.getPages().getCount();
            for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
                PdfPageBase page = pdfDocument.getPages().get(pageIndex);
                extracted.append(page.extractText(true));
            }
            return extracted.length();
        } finally {
            // The document was previously left open; release it on every path.
            pdfDocument.close();
        }
    }


    /**
     * Counts the characters of a text file, ignoring line terminators ({@code '\n'} and
     * {@code '\r'}). All other characters, including spaces and tabs, are counted.
     *
     * <p>The file is decoded by {@link FileReader}, i.e. with its default charset.
     * Failures are printed to stderr; whatever was counted up to that point is returned.
     *
     * @param file text file to read
     * @return number of characters that are not CR or LF; 0 if the file cannot be opened
     */
    public static Integer txtCount(File file){
        int charCount = 0;
        // try-with-resources replaces the manual finally/close dance; buffering keeps the
        // character-by-character loop cheap.
        try (Reader reader = new BufferedReader(new FileReader(file))) {
            int ch;
            while ((ch = reader.read()) != -1) {
                if (ch != '\n' && ch != '\r') {
                    charCount++;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return charCount;
    }

    /**
     * Converts a Word document read from a stream into a plain-text file.
     *
     * <p>The output path is {@code tofile} with its extension replaced by {@code .txt}. Only the
     * extension of the last path segment is replaced, so directories or file names containing
     * additional dots (e.g. {@code /data.v1/report.final.docx}) are preserved.
     *
     * @param inputStream stream positioned at the start of the Word document; not closed here
     * @param fileFormat  Spire format the stream should be parsed as
     * @param tofile      path whose extension is swapped for {@code .txt}
     * @return path of the text file that was written
     */
    public static String docToTxt(InputStream inputStream, FileFormat fileFormat, String tofile) {
        // Strip only a trailing extension; splitting on the first dot would cut the path at
        // any earlier dot in a directory or file name.
        int lastSeparator = Math.max(tofile.lastIndexOf('/'), tofile.lastIndexOf('\\'));
        int lastDot = tofile.lastIndexOf('.');
        String basePath = lastDot > lastSeparator ? tofile.substring(0, lastDot) : tofile;
        String txtPath = basePath + ".txt";

        Document document = new Document();
        try {
            document.loadFromStream(inputStream, fileFormat);
            document.saveToFile(txtPath, FileFormat.Txt);
        } finally {
            // Close even if loading or saving throws.
            document.close();
        }
        return txtPath;
    }

    /**
     * Loads a DOCX from {@code inputStream}, inserts the content of a fixed, machine-specific
     * document into it and saves the result as {@code <tofile without extension>-copy.docx}.
     *
     * <p>Kept for backward compatibility; prefer the three-argument overload, which takes the
     * document to insert as a parameter.
     *
     * @param inputStream stream of the DOCX to extend; not closed here
     * @param tofile      path used to derive the output file name
     * @return path of the saved copy
     */
    public static String sectionBreakAndPageBreak(InputStream inputStream, String tofile){
        return sectionBreakAndPageBreak(inputStream, tofile,
                "E:\\common_data\\translate\\document\\20221101\\111.docx");
    }

    /**
     * Loads a DOCX from {@code inputStream}, calls {@code insertTextFromFile} with
     * {@code insertFilePath} and saves the result as
     * {@code <tofile without extension>-copy.docx} in Docx_2013 format.
     *
     * <p>An earlier variant of this method (previously left as commented-out code) inserted a
     * page break into the fourth paragraph of the first section instead of inserting a file.
     *
     * @param inputStream    stream of the DOCX to extend; not closed here
     * @param tofile         path used to derive the output file name
     * @param insertFilePath path of the DOCX whose content is inserted
     * @return path of the saved copy
     */
    public static String sectionBreakAndPageBreak(InputStream inputStream, String tofile, String insertFilePath){
        // Strip only a trailing extension so dots in directories or file names survive.
        int lastSeparator = Math.max(tofile.lastIndexOf('/'), tofile.lastIndexOf('\\'));
        int lastDot = tofile.lastIndexOf('.');
        String basePath = lastDot > lastSeparator ? tofile.substring(0, lastDot) : tofile;
        String copyPath = basePath + "-copy.docx";

        Document document = new Document();
        try {
            document.loadFromStream(inputStream, FileFormat.Docx);
            document.insertTextFromFile(insertFilePath, FileFormat.Docx_2013);
            // Save the resulting document.
            document.saveToFile(copyPath, FileFormat.Docx_2013);
        } finally {
            // The document was previously never closed.
            document.close();
        }
        return copyPath;
    }


    /**
     * Manual smoke test: counts the characters of one local file and prints the result.
     *
     * @param args optional; {@code args[0]} is the path of the file to count. Without arguments
     *             the original default {@code D:\1.ppt} is used.
     * @throws Exception if the file cannot be opened or counted
     */
    public static void main(String[] args) throws Exception {
        final String fileName;
        final String path;
        if (args.length > 0) {
            path = args[0];
            fileName = new File(path).getName();
        } else {
            fileName = "1.ppt";
            path = "D:\\" + fileName;
        }
        // try-with-resources: the input stream was previously left open.
        try (FileInputStream input = new FileInputStream(path)) {
            wordCount(FileUtil.streamToFile(input, fileName));
        }
    }

    /**
     * Counts the characters of an uploaded file, choosing the parser from the extension of
     * its original file name. Supported: doc, docx, pdf, txt, xls, xlsx, ppt, pptx
     * (matched case-insensitively). Any other extension yields {@code 0}.
     *
     * <p>The result is also printed to stdout.
     *
     * @param file uploaded file
     * @return the character count, or 0 for unsupported types
     * @throws Exception if the extension lookup or opening/closing the upload stream fails
     */
    public static Integer wordCount(MultipartFile file) throws Exception{
        String rawExt = com.sunther.idb.file.FileUtil.getFileExt(file.getOriginalFilename());
        // Normalise so that "DOCX" or "Pdf" are recognised too; guard against a missing extension.
        String fileExt = rawExt == null ? "" : rawExt.toLowerCase(Locale.ROOT);
        List<String> supported = Arrays.asList("doc", "docx", "pdf", "txt", "xls", "xlsx", "ppt", "pptx");

        Integer count = 0;
        if (supported.contains(fileExt)) {
            // Own the stream here so it is closed even when a parser fails half-way;
            // closing an already closed stream is a no-op.
            try (InputStream in = file.getInputStream()) {
                switch (fileExt) {
                    case "doc":
                        count = getCountByDoc(in);
                        break;
                    case "docx":
                        count = getCountByDocx(in);
                        break;
                    case "pdf":
                        count = getCountByPdf(in);
                        break;
                    case "txt":
                        count = getCountByTxt(in);
                        break;
                    case "xls":
                        count = getCountByXls(in);
                        break;
                    case "xlsx":
                        count = getCountByXlsx(in);
                        break;
                    case "ppt":
                        count = getCountByPPT(in);
                        break;
                    case "pptx":
                        count = getCountByPPTX(in);
                        break;
                    default:
                        break;
                }
            }
        }
        System.out.println("文章总字数："+ count);
        return count;
    }


    /**
     * Counts the characters of a binary Word ({@code .doc}) document.
     *
     * <p>Sums the trimmed length of every paragraph returned by
     * {@code WordExtractor.getParagraphText()}. The stream is always closed. Failures are
     * printed to stderr and the count accumulated so far (normally 0) is returned.
     *
     * @param is stream of the {@code .doc} file; closed by this method
     * @return sum of the trimmed paragraph lengths
     */
    private static Integer getCountByDoc(InputStream is){
        int total = 0;
        // All POI objects are Closeable; try-with-resources releases them and the stream
        // even when parsing fails (previously the stream was closed only on success).
        try (InputStream in = is;
             POIFSFileSystem fs = new POIFSFileSystem(in);
             HWPFDocument doc = new HWPFDocument(fs);
             WordExtractor extractor = new WordExtractor(doc)) {
            for (String paragraph : extractor.getParagraphText()) {
                total += paragraph.trim().length();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return total;
    }

    /**
     * Counts the characters of an OOXML Word ({@code .docx}) document.
     *
     * <p>Sums the trimmed length of the string form of every run of every paragraph returned
     * by {@code XWPFDocument.getParagraphs()}. NOTE(review): text held elsewhere (e.g. tables,
     * headers/footers) is presumably not part of that list - confirm if it must be counted.
     *
     * <p>The stream is always closed. Failures are printed to stderr and the count
     * accumulated so far is returned.
     *
     * @param is stream of the {@code .docx} file; closed by this method
     * @return sum of the trimmed run lengths
     */
    private static Integer getCountByDocx(InputStream is){
        int total = 0;
        try (InputStream in = is; XWPFDocument docx = new XWPFDocument(in)) {
            for (XWPFParagraph paragraph : docx.getParagraphs()) {
                for (XWPFRun run : paragraph.getRuns()) {
                    total += run.toString().trim().length();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return total;
    }

    /**
     * Counts the characters of a plain-text stream line by line.
     *
     * <p>Each line is trimmed before its length is added, so leading/trailing whitespace and
     * line terminators are not counted. The stream is decoded with {@link Scanner}'s default
     * charset and is always closed.
     *
     * @param is text stream; closed by this method
     * @return sum of the trimmed line lengths; 0 if the stream cannot be read
     */
    private static Integer getCountByTxt(InputStream is) {
        int total = 0;
        // Closing the Scanner closes the underlying stream as well.
        try (Scanner scanner = new Scanner(is)) {
            // hasNextLine() is the matching guard for nextLine(); hasNext() tests for tokens.
            while (scanner.hasNextLine()) {
                total += scanner.nextLine().trim().length();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return total;
    }

    /**
     * Counts the characters of a binary Excel ({@code .xls}) workbook.
     *
     * <p>Only the first sheet (index 0) is inspected; the per-cell length comes from
     * {@code getCellLength}. The stream is always closed. Failures are printed to stderr and
     * the count accumulated so far is returned.
     *
     * @param is stream of the {@code .xls} file; closed by this method
     * @return sum of the cell lengths of the first sheet
     */
    private static Integer getCountByXls(InputStream is){
        int total = 0;
        try (InputStream in = is;
             POIFSFileSystem fs = new POIFSFileSystem(in);
             HSSFWorkbook workbook = new HSSFWorkbook(fs)) {
            HSSFSheet sheet = workbook.getSheetAt(0);
            // Typed for-each replaces the raw Iterator + cast loops.
            for (Row row : sheet) {
                for (Cell cell : row) {
                    total += getCellLength(cell);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return total;
    }

    /**
     * Counts the characters of an OOXML Excel ({@code .xlsx}) workbook.
     *
     * <p>Only the first sheet (index 0) is inspected; the per-cell length comes from
     * {@code getCellLength}. The stream is always closed. Failures are printed to stderr and
     * the count accumulated so far is returned.
     *
     * @param is stream of the {@code .xlsx} file; closed by this method
     * @return sum of the cell lengths of the first sheet
     */
    private static Integer getCountByXlsx(InputStream is){
        int total = 0;
        try (InputStream in = is; Workbook workbook = new XSSFWorkbook(in)) {
            Sheet firstSheet = workbook.getSheetAt(0);
            for (Row row : firstSheet) {
                for (Cell cell : row) {
                    total += getCellLength(cell);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return total;
    }

    /**
     * Returns the number of characters a single spreadsheet cell contributes to the count.
     *
     * <ul>
     *   <li>STRING: length of the trimmed string value.</li>
     *   <li>NUMERIC: length of the value as rendered by {@link DataFormatter}, so an integer
     *       {@code 1} counts as 1 character rather than the 3 of {@code "1.0"}, and
     *       date-formatted cells are measured by their formatted text.</li>
     *   <li>BOOLEAN: 1.</li>
     *   <li>Everything else (blank, formula, error) and {@code null}: 0.</li>
     * </ul>
     *
     * @param cell cell to measure; may be null
     * @return character count contributed by the cell
     */
    private static Integer getCellLength(Cell cell){
        if (cell == null) {
            return 0;
        }
        switch (cell.getCellType()) {
            case STRING:
                return cell.getStringCellValue().trim().length();
            case NUMERIC:
                // A formatter per call avoids sharing its internal caches between threads.
                return new DataFormatter().formatCellValue(cell).trim().length();
            case BOOLEAN:
                return 1;
            default:
                return 0;
        }
    }

    /**
     * Counts the characters of a PDF using PDFBox text extraction.
     *
     * <p>The extracted text is split on line breaks ({@code \r?\n}) and the lengths of the
     * resulting lines are summed, so line terminators are not counted while other whitespace
     * is. The document and the stream are always closed. Failures are printed to stderr and
     * the count accumulated so far is returned.
     *
     * @param is stream of the PDF file; closed by this method
     * @return sum of the line lengths of the extracted text
     */
    public static Integer getCountByPdf(InputStream is){
        int total = 0;
        // PDDocument must be closed to release its buffers; it was previously left open.
        try (InputStream in = is; PDDocument pdDocument = PDDocument.load(in)) {
            String text = new PDFTextStripper().getText(pdDocument);
            for (String line : text.split("\\r?\\n")) {
                total += line.length();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return total;
    }

    /**
     * Counts the characters of a binary PowerPoint ({@code .ppt}) presentation.
     *
     * <p>For every top-level shape of every slide, the trimmed text of text shapes and of all
     * table cells is counted; blank texts contribute nothing. The slide show and the stream
     * are always closed. Failures are printed to stderr and the count accumulated so far is
     * returned.
     *
     * @param is stream of the {@code .ppt} file; closed by this method
     * @return total length of the non-blank, trimmed texts
     */
    public static Integer getCountByPPT(InputStream is){
        int total = 0;
        try (InputStream in = is; HSLFSlideShow slideShow = new HSLFSlideShow(in)) {
            for (HSLFSlide slide : slideShow.getSlides()) {
                for (HSLFShape shape : slide.getShapes()) {
                    if (shape instanceof HSLFTextShape) {
                        String text = ((HSLFTextShape) shape).getText();
                        if (StringUtils.isNotBlank(text)) {
                            total += text.trim().length();
                        }
                    }
                    if (shape instanceof HSLFTable) {
                        HSLFTable table = (HSLFTable) shape;
                        // Walk the table row by row, column by column.
                        for (int row = 0; row < table.getNumberOfRows(); row++) {
                            for (int col = 0; col < table.getNumberOfColumns(); col++) {
                                HSLFTableCell cell = table.getCell(row, col);
                                // getCell may return null for an empty grid position; skipping
                                // it avoids an NPE that used to abort the whole count.
                                if (cell == null) {
                                    continue;
                                }
                                String text = cell.getText();
                                if (StringUtils.isNotBlank(text)) {
                                    total += text.trim().length();
                                }
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return total;
    }

    /**
     * Counts the characters of an OOXML PowerPoint ({@code .pptx}) presentation.
     *
     * <p>For every top-level shape of every slide, the trimmed text of text shapes and of all
     * table cells is counted; blank texts contribute nothing. The slide show and the stream
     * are always closed. Failures are printed to stderr and the count accumulated so far is
     * returned.
     *
     * @param is stream of the {@code .pptx} file; closed by this method
     * @return total length of the non-blank, trimmed texts
     */
    public static Integer getCountByPPTX(InputStream is){
        int total = 0;
        try (InputStream in = is; XMLSlideShow slideShow = new XMLSlideShow(in)) {
            for (XSLFSlide slide : slideShow.getSlides()) {
                for (XSLFShape shape : slide.getShapes()) {
                    if (shape instanceof XSLFTextShape) {
                        // Text box / placeholder.
                        String text = ((XSLFTextShape) shape).getText();
                        if (StringUtils.isNotBlank(text)) {
                            total += text.trim().length();
                        }
                    }
                    if (shape instanceof XSLFTable) {
                        // Table: visit every cell of every row.
                        for (XSLFTableRow row : ((XSLFTable) shape).getRows()) {
                            for (XSLFTableCell cell : row.getCells()) {
                                String text = cell.getText();
                                if (StringUtils.isNotBlank(text)) {
                                    total += text.trim().length();
                                }
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return total;
    }

}

`合并单元格`

// Merge cells
// NOTE(review): fragment only - mergeParams and sheet are defined outside this snippet.
        if (null != mergeParams && mergeParams.size() > 0){
            for (List<Integer> list : mergeParams) {
                // Arguments in order: 1 = first row, 2 = last row, 3 = first column, 4 = last column
                sheet.addMergedRegion(new CellRangeAddress(list.get(0), list.get(1), list.get(2), list.get(3)));
            }
        }