本文介绍如何使用开源库 PDFBox 和 POI 读取多种文件格式(doc、docx、txt、pdf、ppt、xls、xlsx)中的文本内容。
1. 所需 jar 包的 Maven 坐标:
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>4.1.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.0</version>
</dependency>
2. 读取各种文件格式的完整代码如下:
import java.io.*;
import java.text.SimpleDateFormat;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
import org.apache.poi.hssf.usermodel.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import static org.apache.poi.ss.usermodel.CellType.NUMERIC;
import static org.apache.poi.ss.usermodel.CellType.STRING;
/**
 * Static helpers that extract plain text from several document formats
 * (.doc, .docx, .txt, .pdf, .ppt, .xls, .xlsx) using Apache POI and PDFBox.
 *
 * <p>The doc, txt, docx, pdf and ppt readers remove every whitespace character
 * from the extracted text. The two Excel readers instead emit each cell's
 * trimmed text followed by a single space.
 *
 * @author yujian
 * @date 2016-10-12
 * @version 0.0.1
 */
public class FileFormat {

    /** Pattern used to render date-formatted spreadsheet cells. */
    private static final String DATE_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Reads the text of a binary Word (.doc) file.
     *
     * @param filePath path of the .doc file
     * @return the document text with all whitespace removed
     * @throws Exception if the file cannot be opened or parsed
     */
    public static String getTextFromDoc(String filePath) throws Exception {
        // try-with-resources closes both the stream and the document even when parsing fails.
        try (FileInputStream fis = new FileInputStream(new File(filePath));
             HWPFDocument doc = new HWPFDocument(fis)) {
            Range range = doc.getRange();
            return stripWhitespace(range.text());
        }
    }

    /**
     * Reads a UTF-8 encoded text file.
     *
     * <p>Lines are concatenated without their line terminators and all remaining
     * whitespace is removed, on every return path. An {@link IOException} is not
     * propagated: its stack trace is printed and whatever text was read before
     * the failure (possibly the empty string) is returned.
     *
     * @param filePath path of the text file
     * @return the file content with all whitespace removed
     */
    public static String getTextFromTxt(String filePath) {
        StringBuilder text = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File(filePath)),
                java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line);
            }
        } catch (IOException e) {
            // Best effort: report the failure and fall through with the text read so far.
            e.printStackTrace();
        }
        // Single exit point so the success path is stripped exactly like the failure path.
        return stripWhitespace(text.toString());
    }

    /**
     * Reads the text of an OOXML Word (.docx) file.
     *
     * @param filePath path of the .docx file
     * @return the document text with all whitespace removed
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromDocx(String filePath) throws IOException {
        // One stream only; closing the extractor also releases the underlying document.
        try (FileInputStream in = new FileInputStream(filePath);
             XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in))) {
            return stripWhitespace(extractor.getText());
        }
    }

    /**
     * Reads the text of a PDF file.
     *
     * @param filePath path of the PDF file
     * @return the document text with all whitespace removed
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromPDF(String filePath) throws IOException {
        try (PDDocument document = PDDocument.load(new File(filePath))) {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripWhitespace(stripper.getText(document));
        }
    }

    /**
     * Reads the text of a PowerPoint file through POI's HSLF classes.
     *
     * <p>NOTE(review): despite the method name, only {@code HSLFSlideShowImpl} is
     * used here, which is POI's reader for the binary .ppt format; .pptx input is
     * presumably rejected by it. Confirm before relying on .pptx support.
     *
     * @param filePath path of the presentation file
     * @return the slide text with all whitespace removed
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromPPTOrPPTX(String filePath) throws IOException {
        try (PowerPointExtractor extractor =
                     new PowerPointExtractor(new HSLFSlideShowImpl(filePath))) {
            return stripWhitespace(extractor.getText());
        }
    }

    /**
     * Reads the text of every cell of a binary Excel (.xls) workbook.
     *
     * <p>Sheets, rows and cells are visited in index order; missing rows and cells
     * are skipped. Each cell contributes its trimmed text followed by one space.
     * Date-formatted numeric cells are rendered as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @param filePath path of the .xls file
     * @return the concatenated cell texts
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromxls(String filePath) throws IOException {
        StringBuilder content = new StringBuilder();
        // SimpleDateFormat is not thread-safe, so it is created per call rather than shared.
        SimpleDateFormat dateFormat = new SimpleDateFormat(DATE_PATTERN);
        try (FileInputStream in = new FileInputStream(filePath);
             HSSFWorkbook workbook = new HSSFWorkbook(in)) {
            for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
                HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
                for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                    HSSFRow row = sheet.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                        HSSFCell cell = row.getCell(cellIndex);
                        if (cell != null) {
                            content.append(xlsCellText(cell, dateFormat)).append(" ");
                        }
                    }
                }
            }
        }
        return content.toString();
    }

    /**
     * Reads the text of every cell of an OOXML Excel (.xlsx) workbook.
     *
     * <p>Sheets, rows and cells are visited in index order; missing rows and cells
     * are skipped. Each cell contributes its trimmed text followed by one space.
     * Date-formatted numeric cells are rendered as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * <p>The workbook is loaded from an input stream instead of by path. Opening by
     * path yields a writable package whose {@code close()} saves it, and cells are
     * converted to strings while reading, so the source file could be rewritten.
     * Loading from a stream keeps the file untouched at the cost of more memory.
     *
     * @param filePath path of the .xlsx file
     * @return the concatenated cell texts
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromxlsx(String filePath) throws IOException {
        StringBuilder content = new StringBuilder();
        SimpleDateFormat dateFormat = new SimpleDateFormat(DATE_PATTERN);
        try (FileInputStream in = new FileInputStream(filePath);
             XSSFWorkbook workbook = new XSSFWorkbook(in)) {
            for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
                XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
                for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                    XSSFRow row = sheet.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                        XSSFCell cell = row.getCell(cellIndex);
                        if (cell != null) {
                            content.append(xlsxCellText(cell, dateFormat)).append(" ");
                        }
                    }
                }
            }
        }
        return content.toString();
    }

    /** Removes every whitespace character from {@code text}. */
    private static String stripWhitespace(String text) {
        return text.replaceAll("\\s*", "");
    }

    /** Returns the trimmed text of an .xls cell, formatting date cells with {@code dateFormat}. */
    private static String xlsCellText(HSSFCell cell, SimpleDateFormat dateFormat) {
        if (cell.getCellType() == NUMERIC && HSSFDateUtil.isCellDateFormatted(cell)) {
            return dateFormat.format(cell.getDateCellValue()).trim();
        }
        // Converting the in-memory cell to STRING lets any cell type be read as text.
        cell.setCellType(STRING);
        return cell.getStringCellValue().trim();
    }

    /** Returns the trimmed text of an .xlsx cell, formatting date cells with {@code dateFormat}. */
    private static String xlsxCellText(XSSFCell cell, SimpleDateFormat dateFormat) {
        if (cell.getCellType() == NUMERIC && HSSFDateUtil.isCellDateFormatted(cell)) {
            return dateFormat.format(cell.getDateCellValue()).trim();
        }
        // Converting the in-memory cell to STRING lets any cell type be read as text.
        cell.setCellType(STRING);
        return cell.getStringCellValue().trim();
    }
}