用 Java 读取多种格式的文件(pdf、pptx、ppt、doc、docx 等)

本文使用开源库 Apache PDFBox 和 Apache POI,读取多种文件格式中的文本内容。

1. 所需 jar 包的 Maven 坐标:

<!-- Apache PDFBox: PDF text extraction (PDDocument, PDFTextStripper) -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.2</version>
</dependency>
<!-- Apache POI core: HSSF classes used for .xls -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.0</version>
</dependency>
<!-- Apache POI OOXML: XSSF / XWPF classes used for .xlsx and .docx -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.0</version>
</dependency>
<!-- Schema classes backing the OOXML formats -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>4.1.0</version>
</dependency>
<!-- Apache POI scratchpad: HWPF / HSLF classes used for .doc and .ppt -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.0</version>
</dependency>

2. 处理多种文件格式的完整代码如下:

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
import org.apache.poi.hssf.usermodel.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import static org.apache.poi.ss.usermodel.CellType.NUMERIC;
import static org.apache.poi.ss.usermodel.CellType.STRING;

/**
 * Static helpers that extract plain text from several document formats
 * (doc, docx, txt, pdf, ppt, xls, xlsx) using Apache PDFBox and Apache POI.
 *
 * <p>Every resource opened by these helpers is closed before the method
 * returns, including when extraction fails part-way.
 *
 * @author yujian
 * @date   October 12, 2016
 * @version 0.0.1
 */
public class FileFormat {

    private static final Logger LOGGER = Logger.getLogger(FileFormat.class.getName());

    /** Matches runs of whitespace; compiled once instead of on every call. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Format used to render date-formatted spreadsheet cells. */
    private static final String DATE_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Reads the text of a binary Word ({@code .doc}) file.
     *
     * @param filePath path of the file to read
     * @return the document text with all whitespace removed
     * @throws Exception if the file cannot be opened or parsed
     */
    public static String getTextFromDoc(String filePath) throws Exception {
        try (FileInputStream fis = new FileInputStream(new File(filePath));
             HWPFDocument doc = new HWPFDocument(fis)) {
            return stripWhitespace(doc.getRange().text());
        }
    }

    /**
     * Reads a UTF-8 encoded text file.
     *
     * <p>This method is best-effort: an {@link IOException} is logged rather
     * than thrown, and whatever was read before the failure is returned (an
     * empty string if the file could not be opened at all).
     *
     * @param filePath path of the file to read
     * @return the file content with line breaks and all other whitespace removed
     */
    public static String getTextFromTxt(String filePath) {
        StringBuilder text = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File(filePath)), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line);
            }
        } catch (IOException e) {
            // Kept as best-effort because the signature declares no checked exception.
            LOGGER.log(Level.WARNING, "Failed to read text file: " + filePath, e);
        }
        // Strip whitespace on every path so the result does not depend on
        // whether an I/O error occurred.
        return stripWhitespace(text.toString());
    }

    /**
     * Reads the text of a Word OOXML ({@code .docx}) file.
     *
     * @param filePath path of the file to read
     * @return the document text with all whitespace removed
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromDocx(String filePath) throws IOException {
        try (FileInputStream in = new FileInputStream(filePath);
             XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in))) {
            return stripWhitespace(extractor.getText());
        }
    }

    /**
     * Reads the text of a PDF file.
     *
     * @param filePath path of the file to read
     * @return the document text with all whitespace removed
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromPDF(String filePath) throws IOException {
        try (PDDocument document = PDDocument.load(new File(filePath))) {
            return stripWhitespace(new PDFTextStripper().getText(document));
        }
    }

    /**
     * Reads the text of a PowerPoint file.
     *
     * <p>NOTE(review): extraction goes through the HSLF classes, which target
     * the binary {@code .ppt} format; {@code .pptx} (OOXML) input presumably
     * needs the XSLF classes instead and is not confirmed to work here —
     * verify before relying on it.
     *
     * @param filePath path of the file to read
     * @return the slide text with all whitespace removed
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromPPTOrPPTX(String filePath) throws IOException {
        try (PowerPointExtractor extractor =
                     new PowerPointExtractor(new HSLFSlideShowImpl(filePath))) {
            return stripWhitespace(extractor.getText());
        }
    }

    /**
     * Reads the cell contents of a binary Excel ({@code .xls}) file.
     *
     * @param filePath path of the file to read
     * @return the text of every non-empty cell, each followed by one space
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromxls(String filePath) throws IOException {
        try (FileInputStream in = new FileInputStream(filePath);
             HSSFWorkbook workbook = new HSSFWorkbook(in)) {
            return extractWorkbookText(workbook);
        }
    }

    /**
     * Reads the cell contents of an Excel OOXML ({@code .xlsx}) file.
     *
     * <p>The workbook is loaded from an input stream rather than opened by
     * path, so closing it cannot write anything back to the source file.
     *
     * @param filePath path of the file to read
     * @return the text of every non-empty cell, each followed by one space
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromxlsx(String filePath) throws IOException {
        try (FileInputStream in = new FileInputStream(filePath);
             XSSFWorkbook workbook = new XSSFWorkbook(in)) {
            return extractWorkbookText(workbook);
        }
    }

    /** Removes every whitespace character from {@code text}. */
    private static String stripWhitespace(String text) {
        return WHITESPACE.matcher(text).replaceAll("");
    }

    /** Concatenates the text of every existing cell in sheet, row, column order. */
    private static String extractWorkbookText(Workbook workbook) {
        StringBuilder content = new StringBuilder();
        // SimpleDateFormat is not thread-safe, so one instance is created per call.
        SimpleDateFormat dateFormat = new SimpleDateFormat(DATE_PATTERN);
        for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
            Sheet sheet = workbook.getSheetAt(sheetIndex);
            // Iterating a sheet / row visits only the rows and cells that exist.
            for (Row row : sheet) {
                for (Cell cell : row) {
                    content.append(cellText(cell, dateFormat)).append(' ');
                }
            }
        }
        return content.toString();
    }

    /** Renders one cell: date-formatted numerics as a timestamp, anything else as a trimmed string. */
    private static String cellText(Cell cell, SimpleDateFormat dateFormat) {
        if (cell.getCellType() == NUMERIC && DateUtil.isCellDateFormatted(cell)) {
            return dateFormat.format(cell.getDateCellValue()).trim();
        }
        // Converting the in-memory cell to STRING lets getStringCellValue()
        // be called on non-string cells; the workbook is never saved.
        cell.setCellType(STRING);
        return cell.getStringCellValue().trim();
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值