本文摘自一位大佬的技术分享，具体是哪位已经记不清了，在此向大佬致歉并表示感谢。感谢大佬的技术分享，我就厚着脸皮发布了！😉
1.所需依赖
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.15</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.15</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.15</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.15</version>
</dependency>
<dependency>
<groupId>org.apache.xmlbeans</groupId>
<artifactId>xmlbeans</artifactId>
<version>2.6.0</version>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
2.代码示例
/**
 * Reads the text of a legacy Word ({@code .doc}) file.
 *
 * @param filePath path of the .doc file to read
 * @return the text of the document's overall range
 * @throws Exception if the file cannot be opened or parsed
 */
public static String getTextFromDoc(String filePath) throws Exception{
    // try-with-resources closes the stream even when parsing or extraction fails.
    try (FileInputStream fis = new FileInputStream(new File(filePath))) {
        HWPFDocument doc = new HWPFDocument(fis);
        Range range = doc.getRange();
        return range.text();
    }
}
/**
 * Reads the text of a Word ({@code .docx}) file.
 *
 * @param filePath path of the .docx file to read
 * @return the text produced by {@code XWPFWordExtractor}
 * @throws IOException if the file cannot be opened or read
 */
public static String getTextFromDocx(String filePath) throws IOException {
    // Resources are closed in reverse order: the extractor first, then the stream,
    // on both the normal and the exceptional path.
    try (FileInputStream in = new FileInputStream(filePath);
         XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in))) {
        return extractor.getText();
    }
}
/**
 * Reads the text of a PDF file.
 *
 * @param filePath path of the PDF file to read
 * @return the text produced by {@code PDFTextStripper} for the whole document
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
public static String getTextFromPDF(String filePath) throws IOException{
    File input = new File(filePath);
    // The loaded document must be closed; the original never released it.
    try (PDDocument document = PDDocument.load(input)) {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(document);
    }
}
/**
 * Reads the text of a legacy PowerPoint ({@code .ppt}) file.
 *
 * @param filePath path of the .ppt file to read
 * @return the text produced by {@code PowerPointExtractor}
 * @throws IOException if the file cannot be opened or read
 */
public static String getTextFromPPT( String filePath) throws IOException{
    // Both the extractor and the stream are closed even if getText() throws.
    try (FileInputStream in = new FileInputStream(filePath);
         PowerPointExtractor extractor = new PowerPointExtractor(in)) {
        return extractor.getText();
    }
}
/**
 * Reads the text of a PowerPoint ({@code .pptx}) file.
 *
 * <p>Walks every slide's shape tree and concatenates the text of all regular
 * text runs, with no separator between runs, paragraphs, shapes or slides.
 *
 * <p>Extraction is best-effort: if parsing the presentation fails, the stack
 * trace is printed and {@code null} is returned instead of throwing.
 *
 * @param filePath path of the .pptx file to read
 * @return the concatenated run text, or {@code null} if parsing failed
 * @throws IOException if the file cannot be opened or the stream cannot be closed
 */
public static String getTextFromPPTX( String filePath) throws IOException{
    String resultString = null;
    // Outer try closes the file stream, which the original never closed.
    try (FileInputStream in = new FileInputStream(filePath)) {
        // Inner try closes the slide show on both success and failure.
        try (XMLSlideShow slideShow = new XMLSlideShow(in)) {
            StringBuilder sb = new StringBuilder();
            for (XSLFSlide slide : slideShow.getSlides()) {
                CTSlide rawSlide = slide.getXmlObject();
                CTGroupShape shapeTree = rawSlide.getCSld().getSpTree();
                for (CTShape shape : shapeTree.getSpArray()) {
                    CTTextBody textBody = shape.getTxBody();
                    if (textBody == null) {
                        // Shape carries no text body; nothing to extract.
                        continue;
                    }
                    for (CTTextParagraph paragraph : textBody.getPArray()) {
                        for (CTRegularTextRun run : paragraph.getRArray()) {
                            sb.append(run.getT());
                        }
                    }
                }
            }
            resultString = sb.toString();
        } catch (Exception e) {
            // Deliberate best-effort: report the failure and fall through
            // to return whatever was (or was not) extracted.
            e.printStackTrace();
        }
    }
    return resultString;
}
/**
 * Reads the cell text of a legacy Excel ({@code .xls}) workbook.
 *
 * <p>Visits every sheet, row and cell in order and appends each non-null
 * cell's text followed by a single space. Missing rows and cells are skipped.
 *
 * <p>Numeric, boolean and error cells are converted explicitly, because
 * {@code getRichStringCellValue()} throws for them. Formula cells are
 * rendered from their cached result.
 *
 * @param filePath path of the .xls file to read
 * @return the space-separated cell text of the whole workbook
 * @throws IOException if the file cannot be opened or read
 */
public static String getTextFromxls(String filePath) throws IOException{
    StringBuilder content = new StringBuilder();
    // Plain digits without thousands separators for numeric cells.
    NumberFormat numberFormat = NumberFormat.getInstance();
    numberFormat.setGroupingUsed(false);
    // try-with-resources closes workbook and stream even if a cell read fails.
    try (FileInputStream in = new FileInputStream(filePath);
         HSSFWorkbook workbook = new HSSFWorkbook(in)) {
        for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
            HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                HSSFRow row = sheet.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                for (int cellnum = 0; cellnum < row.getLastCellNum(); cellnum++) {
                    HSSFCell cell = row.getCell(cellnum);
                    if (cell == null) {
                        continue;
                    }
                    int type = cell.getCellType();
                    if (type == HSSFCell.CELL_TYPE_FORMULA) {
                        // Dispatch on the cached result so formulas are read like plain cells.
                        type = cell.getCachedFormulaResultType();
                    }
                    String text;
                    switch (type) {
                        case HSSFCell.CELL_TYPE_NUMERIC:
                            text = numberFormat.format(cell.getNumericCellValue());
                            break;
                        case HSSFCell.CELL_TYPE_BOOLEAN:
                            text = String.valueOf(cell.getBooleanCellValue());
                            break;
                        case HSSFCell.CELL_TYPE_ERROR:
                            text = String.valueOf(cell.getErrorCellValue());
                            break;
                        default:
                            // String and blank cells: same call the original used.
                            text = cell.getRichStringCellValue().getString();
                            break;
                    }
                    content.append(text).append(" ");
                }
            }
        }
    }
    return content.toString();
}
/**
 * Reads the cell text of an Excel ({@code .xlsx}) workbook.
 *
 * <p>Visits every sheet, row and cell position in order. Each cell position
 * contributes the converted cell text (nothing for a missing cell) followed
 * by a single space. Missing sheets and rows are skipped.
 *
 * @param filePath path of the .xlsx file to read
 * @return the space-separated cell text of the whole workbook
 * @throws IOException if the file cannot be opened, read or closed
 */
public static String getTextFromxlsx(String filePath) throws IOException{
    StringBuilder content = new StringBuilder();
    // try-with-resources closes the workbook even if cell conversion throws.
    try (XSSFWorkbook workbook = new XSSFWorkbook(filePath)) {
        for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
            XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            if (sheet == null) {
                continue;
            }
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                XSSFRow row = sheet.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                    XSSFCell cell = row.getCell(cellIndex);
                    if (cell != null) {
                        // Convert once; appending an empty string is a no-op.
                        content.append(convertCell(cell));
                    }
                    // A separator is emitted for every column position, empty or not.
                    content.append(" ");
                }
            }
        }
    }
    return content.toString();
}
private static String (Cell cell){
NumberFormat formater = NumberFormat.getInstance();
formater.setGroupingUsed(false);
String cellValue="";
if(cell==null){
return cellValue;
}
switch(cell.getCellType()){
case HSSFCell.CELL_TYPE_NUMERIC:
cellValue = formater.format(cell.getNumericCellValue());
break;
case HSSFCell.CELL_TYPE_STRING:
cellValue = cell.getStringCellValue();
break;
case HSSFCell.CELL_TYPE_BLANK:
cellValue = cell.getStringCellValue();
break;
case HSSFCell.CELL_TYPE_BOOLEAN:
cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString();
break;
case HSSFCell.CELL_TYPE_ERROR:
cellValue = String.valueOf(cell.getErrorCellValue());
break;
default:cellValue="";
}
return cellValue.trim();
}