读取文件（txt,doc,docx,xlsx,xls,pdf）内容并转化为base64编码

呆。@473

已于 2022-04-08 11:13:39 修改

阅读量3.3k

点赞数 1

分类专栏： java 文章标签： java maven

于 2022-04-08 11:08:44 首次发布

本文链接：https://blog.csdn.net/weixin_60382322/article/details/124035473

版权

java 专栏收录该内容

6 篇文章 1 订阅

订阅专栏

将文件（txt,doc,docx,xlsx,xls,pdf）内容转化为base64编码;读取文件的内容；

运行结果
运行效果

1.添加maven依赖（版本可根据需要自行更改）

        <!-- PDFBox: the PDF snippet in this article uses PDFParser / PDDocument / PDFTextStripper. -->
        <dependency>
           <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>1.8.8</version>
        </dependency>
        <!-- NOTE(review): no iText class is referenced by the code shown in this article;
             presumably this dependency is optional - confirm before keeping it. -->
        <dependency>
            <groupId>com.itextpdf</groupId>
            <artifactId>itextpdf</artifactId>
            <version>5.0.6</version>
        </dependency>
        <!-- Apache POI modules, all pinned to the same version (3.16).
             The article's code uses HSSF*, XSSF*, XWPF* classes and WordExtractor.
             NOTE(review): poi-examples and poi-excelant look unnecessary for that - confirm. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.16</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-examples</artifactId>
            <version>3.16</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-excelant</artifactId>
            <version>3.16</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.16</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml-schemas</artifactId>
            <version>3.16</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>3.16</version>
        </dependency>

2.在domain创建文件实体类fileDTO

/**
 * Simple mutable data-transfer object describing a file by four string
 * attributes: id, name, type and content.
 */
public class fileDTO {

    private String id;
    private String name;
    private String type;
    private String content;

    /** Creates an instance with every attribute left {@code null}. */
    public fileDTO() {
        this(null, null, null, null);
    }

    /**
     * Creates a fully populated instance.
     *
     * @param id      identifier of the file
     * @param name    file name
     * @param type    file type
     * @param content file content
     */
    public fileDTO(String id, String name, String type, String content) {
        this.id = id;
        this.name = name;
        this.type = type;
        this.content = content;
    }

    /** @return the identifier, possibly {@code null} */
    public String getId() {
        return this.id;
    }

    /** @param id the identifier to store */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the file name, possibly {@code null} */
    public String getName() {
        return this.name;
    }

    /** @param name the file name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the file type, possibly {@code null} */
    public String getType() {
        return this.type;
    }

    /** @param type the file type to store */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the content, possibly {@code null} */
    public String getContent() {
        return this.content;
    }

    /** @param content the content to store */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * Renders all four attributes on one line; {@code null} attributes are
     * rendered as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("[ id: ");
        text.append(id);
        text.append(" name: ").append(name);
        text.append(" type: ").append(type);
        text.append("  content:  ").append(content);
        text.append("]");
        return text.toString();
    }
}

3.根据文件类型读取其文件内容，并将内容转化为base64编码

 //将文件内容转化为base64编码
    /**
     * Reads a hard-coded sample file, extracts its content according to the
     * file extension and stores the Base64-encoded content in a {@code fileDTO}.
     * The file name and the resulting Base64 string are echoed to standard output.
     *
     * <p>Supported extensions (matched case-insensitively): docx, txt, doc,
     * xlsx, xls, pdf.
     *
     * @throws IOException              if the txt reader fails to read the file
     * @throws IllegalArgumentException if the file extension is not one of the
     *                                  supported types (the original code failed
     *                                  with a bare NullPointerException here)
     */
    void readFile() throws IOException {
        // Path of the file to convert.
        File file = new File("D:/test/测试.pdf");

        fileDTO filedto = new fileDTO();
        filedto.setName(file.getName());
        System.out.println(filedto.getName());
        // Everything after the last '.' is treated as the type.
        filedto.setType(file.getName().substring(file.getName().lastIndexOf(".") + 1));

        // Normalise only for dispatching so that "REPORT.PDF" is handled like "report.pdf";
        // the DTO keeps the extension exactly as it appears in the file name.
        String normalizedType = filedto.getType().toLowerCase(java.util.Locale.ROOT);

        // Pick the reader that matches the file type.
        final byte[] bytes;
        switch (normalizedType) {
            case "docx":
                System.out.println("docx");
                bytes = getdocxContent(file.getPath());
                break;
            case "txt":
                System.out.println("txt");
                bytes = gettxtContent(file);
                break;
            case "doc":
                System.out.println("doc");
                bytes = getdocContent(file.getPath());
                break;
            case "xlsx":
                System.out.println("xlsx");
                bytes = getxlsxContent(file.getPath());
                break;
            case "xls":
                System.out.println("xls");
                bytes = getxlsContent(file.getPath());
                break;
            case "pdf":
                System.out.println("pdf");
                bytes = getpdfContent(file.getPath());
                break;
            default:
                // Fail with a clear message instead of passing null to the Base64 encoder.
                throw new IllegalArgumentException(
                        "Unsupported file type: " + filedto.getType());
        }

        String base64 = Base64.getEncoder().encodeToString(bytes);
        filedto.setContent(base64);
        System.out.println(filedto.getContent());
    }

4.若是只读取内容，不转base64，只需要以下代码即可（以gettxtContent为例）

/**
 * Reads the whole file and returns its content decoded as UTF-8 text,
 * without any Base64 conversion.
 *
 * @param filePath the file to read
 * @return the file content as a string
 * @throws IOException if the file cannot be read
 */
String gettxtContent(File filePath) throws IOException {
    return new String(
            Files.readAllBytes(Paths.get(String.valueOf(filePath))),
            StandardCharsets.UTF_8);
}

读取文件内容，返回byte[]类型（base64编码统一在readFile中通过Base64.getEncoder()完成）


 /**
     * Reads a txt file and returns its content as UTF-8 encoded bytes.
     *
     * <p>The file is decoded as UTF-8 and encoded again as UTF-8. The original
     * code re-encoded with the platform default charset, which corrupts
     * non-ASCII text on JVMs whose default charset is not UTF-8. Malformed
     * byte sequences in the file are replaced by the decoder's replacement
     * character, as before.
     *
     * @param filePath the txt file to read
     * @return the file content encoded as UTF-8
     * @throws IOException          if the file cannot be read
     * @throws NullPointerException if {@code filePath} is {@code null}
     */
    byte[] gettxtContent(File filePath) throws IOException {
        byte[] raw = Files.readAllBytes(filePath.toPath());
        String content = new String(raw, StandardCharsets.UTF_8);
        // Always state the charset: getBytes() without one depends on the platform default.
        return content.getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Extracts the text of a docx file and returns it as UTF-8 encoded bytes.
     * The extracted text is also echoed to standard output, wrapped in square
     * brackets.
     *
     * <p>Changes over the original: the extractor is closed after use, the
     * text is encoded with an explicit charset, and a read failure yields an
     * empty array instead of the bytes of the literal text "null".
     *
     * @param path path of the docx file
     * @return the extracted text encoded as UTF-8, or an empty array when the
     *         file could not be read
     */
    byte[] getdocxContent(String path){
        String content;
        // Only the extractor is closed here. NOTE(review): POIXMLTextExtractor.close()
        // is expected to release the underlying package without writing the file back;
        // confirm against the POI version in use.
        try (POIXMLTextExtractor poiText =
                     new XWPFWordExtractor(new XWPFDocument(POIXMLDocument.openPackage(path)))) {
            content = poiText.getText();
        } catch (IOException e) {
            e.printStackTrace();
            return new byte[0];
        }
        if (content == null) {
            return new byte[0];
        }
        // Same console echo as before (the text used to be printed as a one-element list).
        System.out.println("[" + content + "]");
        return content.getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Extracts the text of a Word file with the doc suffix and returns it as
     * UTF-8 encoded bytes. The extracted text is also echoed to standard
     * output, wrapped in square brackets.
     *
     * <p>Changes over the original: the input stream and the extractor are
     * closed after use, the text is encoded with an explicit charset, and a
     * read failure yields an empty array instead of the bytes of the literal
     * text "null".
     *
     * @param filePath path of the doc file
     * @return the extracted text encoded as UTF-8, or an empty array when the
     *         file could not be read
     */
    byte[] getdocContent(String filePath){
        String content;
        // try-with-resources guarantees the file handle is released even when extraction fails.
        try (InputStream input = new FileInputStream(new File(filePath));
             WordExtractor wex = new WordExtractor(input)) {
            content = wex.getText();
        } catch (Exception e) {
            e.printStackTrace();
            return new byte[0];
        }
        if (content == null) {
            return new byte[0];
        }
        // Same console echo as before (the text used to be printed as a one-element list).
        System.out.println("[" + content + "]");
        return content.getBytes(StandardCharsets.UTF_8);
    }


    /**
     * Reads every cell of the first sheet of an xlsx file and returns the cell
     * texts, joined with commas, as UTF-8 encoded bytes. Cell values and the
     * collected list are echoed to standard output.
     *
     * <p>Changes over the original: the last row of the sheet is no longer
     * skipped, the stream and workbook are closed after use, and the result is
     * encoded with an explicit charset. If reading fails part-way, the cells
     * collected so far are still returned (same best-effort behaviour as before).
     *
     * @param filePath path of the xlsx file
     * @return comma-joined cell texts encoded as UTF-8 (empty when nothing was read)
     */
    byte[] getxlsxContent(String filePath){
        List<String> list = new ArrayList<>();
        // Read through a stream; both resources are closed automatically.
        try (FileInputStream fis = new FileInputStream(new File(filePath));
             XSSFWorkbook hb = new XSSFWorkbook(fis)) {
            System.out.println(hb.getNumCellStyles());
            // Only the first sheet is read.
            Sheet sheet = hb.getSheetAt(0);
            int firstrow = sheet.getFirstRowNum();
            int lastrow = sheet.getLastRowNum();
            // getLastRowNum() is the index of the last row itself, so the bound is inclusive.
            for (int i = firstrow; i <= lastrow; i++) {
                Row row = sheet.getRow(i);
                if (row == null) {
                    continue;
                }
                int firstcell = row.getFirstCellNum();
                // getLastCellNum() is already one past the last cell, so this bound is exclusive.
                int lastcell = row.getLastCellNum();
                for (int j = firstcell; j < lastcell; j++) {
                    Cell cell = row.getCell(j);
                    if (cell != null) {
                        System.out.println(cell.toString());
                        list.add(cell.toString());
                    }
                }
                System.out.println();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println(list);
        return String.join(",", list).getBytes(StandardCharsets.UTF_8);
    }


    /**
     * Reads every cell of the first sheet of an xls file and returns the cell
     * texts, joined with commas, as UTF-8 encoded bytes. The collected list is
     * echoed to standard output.
     *
     * <p>Changes over the original: the last row of the sheet is no longer
     * skipped, the file system and workbook are closed after use (the file
     * handle used to stay open), and the result is encoded with an explicit
     * charset. If reading fails part-way, the cells collected so far are still
     * returned (same best-effort behaviour as before).
     *
     * @param filePath path of the xls file
     * @return comma-joined cell texts encoded as UTF-8 (empty when nothing was read)
     */
    byte[] getxlsContent(String filePath){
        List<String> list = new ArrayList<>();

        // Both the parsed file system and the workbook are closed automatically.
        try (POIFSFileSystem pSystem = new POIFSFileSystem(new File(filePath));
             HSSFWorkbook hb = new HSSFWorkbook(pSystem)) {
            System.out.println(hb.getNumCellStyles());
            // Only the first sheet is read.
            HSSFSheet sheet = hb.getSheetAt(0);
            int firstrow = sheet.getFirstRowNum();
            int lastrow = sheet.getLastRowNum();
            // getLastRowNum() is the index of the last row itself, so the bound is inclusive.
            for (int i = firstrow; i <= lastrow; i++) {
                HSSFRow row = sheet.getRow(i);
                if (row == null) {
                    continue;
                }
                int firstcell = row.getFirstCellNum();
                // getLastCellNum() is already one past the last cell, so this bound is exclusive.
                int lastcell = row.getLastCellNum();
                for (int j = firstcell; j < lastcell; j++) {
                    HSSFCell cell = row.getCell(j);
                    if (cell != null) {
                        list.add(cell.toString());
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println(list);
        return String.join(",", list).getBytes(StandardCharsets.UTF_8);
    }


    /**
     * Extracts the text of all pages of a pdf file and returns it as UTF-8
     * encoded bytes. The extracted text is also echoed to standard output.
     *
     * <p>Changes over the original: the document and the input stream are
     * closed after use, the text is encoded with an explicit charset, and an
     * extraction failure yields an empty array instead of the bytes of the
     * literal text "null".
     *
     * @param filePath path of the pdf file
     * @return the extracted text encoded as UTF-8, or an empty array when the
     *         text could not be extracted
     */
    byte[] getpdfContent(String filePath) {
        String content = null;
        InputStream input = null;
        PDDocument document = null;
        try {
            input = new FileInputStream(new File(filePath));
            // Load the pdf document.
            PDFParser parser = new PDFParser(input);
            parser.parse();
            document = parser.getPDDocument();

            // Extract pages 1..N in stream order (no sorting by position).
            PDFTextStripper pts = new PDFTextStripper();
            pts.setSortByPosition(false);
            pts.setStartPage(1);
            pts.setEndPage(document.getNumberOfPages());

            content = pts.getText(document);
            System.out.println(content);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Explicit close calls rather than try-with-resources, so this does not
            // depend on PDDocument implementing Closeable in the PDFBox version in use.
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (input != null) {
                try {
                    input.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        if (content == null) {
            return new byte[0];
        }
        return content.getBytes(StandardCharsets.UTF_8);
    }

呆。@473

关注

1
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
读取文件（txt,doc,docx,xlsx,xls,pdf）内容并转化为base64编码

将文件（txt,doc,docx,xlsx,xls,pdf）内容转化为base64编码;读取文件的内容；运行结果1.添加maven依赖版本自行更改 <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>1.8.8&lt
复制链接

扫一扫

专栏目录