JAVA提取Word,Excel,PPT,PDF,TXT等文档文字内容

最新推荐文章于 2024-04-18 09:39:19 发布

wooden_fish丶

最新推荐文章于 2024-04-18 09:39:19 发布

阅读量5.2k

点赞数 2

文章标签： java word excel 文档 pdf

本文链接：https://blog.csdn.net/a464587444/article/details/56840220

版权

首先在项目的 pom.xml 中引入以下 Maven 依赖（Apache POI 与 PDFBox）：

		<!-- Apache POI core artifact, version 3.15 -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi</artifactId>
			<version>3.15</version>
		</dependency>
		<!-- POI OOXML artifact; presumably supplies the XWPF/XSSF/XSLF classes used for docx/xlsx/pptx - confirm -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml</artifactId>
			<version>3.15</version>
		</dependency>
		<!-- POI scratchpad artifact; presumably supplies the legacy Word/PowerPoint extractors used for doc/ppt - confirm -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-scratchpad</artifactId>
			<version>3.15</version>
		</dependency>
		<!-- Apache PDFBox, version 2.0.4; keep all three POI artifacts on the same version -->
		<dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>pdfbox</artifactId>
			<version>2.0.4</version>
		</dependency>

/**
 * Extracts plain text from in-memory Word, Excel, PowerPoint, PDF and TXT documents.
 *
 * <p>Every extraction method is best-effort: a failure is printed to standard error and an empty
 * string is returned for that document instead of propagating the checked exception.
 */
public class ParseText {

	/**
	 * Picks the extractor that matches the document type and returns the document text with all
	 * whitespace removed.
	 *
	 * @param buffer raw bytes of the document; {@code null} or empty yields an empty string
	 * @param suffix file extension without the dot ("doc", "docx", "xls", "xlsx", "ppt", "pptx",
	 *               "pdf", "txt"); matched case-insensitively, surrounding blanks are ignored
	 * @return the extracted text with every whitespace character removed, or an empty string when
	 *         the type is unsupported or extraction failed
	 */
	public static String parse(byte[] buffer, String suffix) {
		// Nothing to extract; also keeps the parsers from failing on a null or empty payload.
		if (buffer == null || buffer.length == 0) {
			return "";
		}
		// A switch on a null String throws NullPointerException, so null is mapped to "" and
		// falls through to the "unsupported" branch. Locale.ROOT keeps lower-casing stable.
		String type = suffix == null ? "" : suffix.trim().toLowerCase(java.util.Locale.ROOT);

		String text = "";
		switch (type) {
		case "doc":
			text = getTextFromWord(buffer);
			break;
		case "docx":
			text = getTextFromWord2007(buffer);
			break;
		case "xls":
			text = getTextFromExcel(buffer);
			break;
		case "xlsx":
			text = getTextFromExcel2007(buffer);
			break;
		case "ppt":
			text = getTextFromPPT(buffer);
			break;
		case "pptx":
			text = getTextFromPPT2007(buffer);
			break;
		case "pdf":
			text = getTextFormPDF(buffer);
			break;
		case "txt":
			text = getTextFormTxt(buffer);
			break;
		default:
			System.out.println("不支持解析的文档类型");
		}

		// "\\s+" removes the same characters as the former "\\s*" without matching the empty
		// string at every position.
		return text.replaceAll("\\s+", "");
	}

	/**
	 * Reads the whole text of a Word 97-2003 (.doc) document.
	 * Per the original author's note, embedded pictures are not read.
	 *
	 * @param file raw bytes of the document
	 * @return the extracted text, or an empty string when extraction fails
	 */
	private static String getTextFromWord(byte[] file) {
		String text = "";
		// try-with-resources closes the extractor even when getText() throws.
		try (InputStream in = new ByteArrayInputStream(file);
				WordExtractor extractor = new WordExtractor(in)) {
			text = extractor.getText();
		} catch (Exception e) {
			// Deliberately broad, as in the original: any parser failure yields an empty result.
			e.printStackTrace();
		}
		return text;
	}

	/**
	 * Reads the whole text of a Word 2007+ (.docx) document.
	 *
	 * @param file raw bytes of the document
	 * @return the extracted text, or an empty string when an I/O error occurs
	 */
	private static String getTextFromWord2007(byte[] file) {
		String text = "";
		try (InputStream in = new ByteArrayInputStream(file);
				XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in))) {
			text = extractor.getText();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return text;
	}

	/**
	 * Reads the whole text of an Excel 97-2003 (.xls) workbook. Formula cells contribute their
	 * results rather than the formula text, and sheet names are left out.
	 *
	 * @param file raw bytes of the workbook
	 * @return the extracted text, or an empty string when an I/O error occurs
	 */
	private static String getTextFromExcel(byte[] file) {
		String text = "";
		try (InputStream in = new ByteArrayInputStream(file);
				ExcelExtractor extractor = new ExcelExtractor(new HSSFWorkbook(new POIFSFileSystem(in)))) {
			extractor.setFormulasNotResults(false);
			extractor.setIncludeSheetNames(false);
			text = extractor.getText();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return text;
	}

	/**
	 * Reads the whole text of an Excel 2007+ (.xlsx) workbook, leaving sheet names out.
	 *
	 * @param file raw bytes of the workbook
	 * @return the extracted text, or an empty string when an I/O error occurs
	 */
	private static String getTextFromExcel2007(byte[] file) {
		String text = "";
		try (InputStream in = new ByteArrayInputStream(file);
				XSSFExcelExtractor extractor = new XSSFExcelExtractor(new XSSFWorkbook(in))) {
			extractor.setIncludeSheetNames(false);
			text = extractor.getText();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return text;
	}

	/**
	 * Reads the whole text of a PowerPoint 97-2003 (.ppt) presentation.
	 *
	 * @param file raw bytes of the presentation
	 * @return the extracted text, or an empty string when extraction fails
	 */
	private static String getTextFromPPT(byte[] file) {
		String text = "";
		try (InputStream in = new ByteArrayInputStream(file);
				PowerPointExtractor extractor = new PowerPointExtractor(in)) {
			text = extractor.getText();
		} catch (Exception e) {
			// Deliberately broad, as in the original: any parser failure yields an empty result.
			e.printStackTrace();
		}
		return text;
	}

	/**
	 * Reads the whole text of a PowerPoint 2007+ (.pptx) presentation.
	 *
	 * @param file raw bytes of the presentation
	 * @return the extracted text, or an empty string when an I/O error occurs
	 */
	private static String getTextFromPPT2007(byte[] file) {
		String text = "";
		try (InputStream in = new ByteArrayInputStream(file);
				XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(new XMLSlideShow(in))) {
			text = extractor.getText();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return text;
	}

	/**
	 * Reads the whole text of a PDF document.
	 *
	 * @param file raw bytes of the PDF
	 * @return the extracted text, or an empty string when an I/O error occurs
	 */
	private static String getTextFormPDF(byte[] file) {
		String text = "";
		try (InputStream in = new ByteArrayInputStream(file);
				PDDocument document = PDDocument.load(in)) {
			text = new PDFTextStripper().getText(document);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return text;
	}

	/**
	 * Decodes a plain-text file using the charset guessed by {@link #get_charset(byte[])}.
	 * A leading byte-order mark is dropped so that it does not leak into the result.
	 *
	 * @param file raw bytes of the text file
	 * @return the decoded text, or an empty string when the charset is not supported
	 */
	private static String getTextFormTxt(byte[] file) {
		String text = "";
		try {
			text = new String(file, get_charset(file));
			// The UTF-8 / UTF-16LE / UTF-16BE decoders keep the BOM as U+FEFF, and "\\s" does
			// not match it, so it has to be removed explicitly.
			if (!text.isEmpty() && text.charAt(0) == '\uFEFF') {
				text = text.substring(1);
			}
		} catch (IOException e) {
			// Covers UnsupportedEncodingException, which is an IOException.
			e.printStackTrace();
		}
		return text;
	}

	/**
	 * Guesses the charset of a text file.
	 *
	 * <p>A byte-order mark decides immediately (UTF-16LE, UTF-16BE or UTF-8). Without one the
	 * bytes are scanned: ASCII bytes and well-formed two-byte sequences are skipped because they
	 * are also plausible GBK, the first well-formed three-byte sequence selects UTF-8, and any
	 * other non-ASCII byte ends the scan with the GBK default.
	 *
	 * @param file raw bytes of the text file; {@code null} or empty yields the default
	 * @return "UTF-16LE", "UTF-16BE", "UTF-8" or the default "GBK"
	 * @throws IOException never thrown by this implementation; kept in the signature so that
	 *                     existing callers with a matching catch clause keep compiling
	 */
	private static String get_charset(byte[] file) throws IOException {
		final String fallback = "GBK";
		if (file == null || file.length == 0) {
			return fallback;
		}

		final int length = file.length;
		if (length >= 2) {
			int first = file[0] & 0xFF;
			int second = file[1] & 0xFF;
			if (first == 0xFF && second == 0xFE) {
				return "UTF-16LE";
			}
			if (first == 0xFE && second == 0xFF) {
				return "UTF-16BE";
			}
			if (length >= 3 && first == 0xEF && second == 0xBB && (file[2] & 0xFF) == 0xBF) {
				return "UTF-8";
			}
		}

		int index = 0;
		while (index < length) {
			int lead = file[index++] & 0xFF;
			if (lead < 0x80) {
				continue; // plain ASCII tells nothing, keep scanning
			}
			if (lead >= 0xC0 && lead <= 0xDF) {
				// Two-byte form (0xC0-0xDF)(0x80-0xBF) is valid UTF-8 but may also be GBK.
				if (index < length && isContinuationByte(file[index])) {
					index++;
					continue;
				}
				return fallback;
			}
			if (lead >= 0xE0 && lead <= 0xEF) {
				// Three-byte form: a small chance of a false positive, accepted as UTF-8.
				boolean wellFormed = index + 1 < length
						&& isContinuationByte(file[index])
						&& isContinuationByte(file[index + 1]);
				return wellFormed ? "UTF-8" : fallback;
			}
			// A lone 0x80-0xBF byte or a lead byte of 0xF0 and above is treated as GBK.
			return fallback;
		}
		return fallback;
	}

	/** Tells whether the byte lies in 0x80-0xBF, the range of a UTF-8 continuation byte. */
	private static boolean isContinuationByte(byte value) {
		int unsigned = value & 0xFF;
		return unsigned >= 0x80 && unsigned <= 0xBF;
	}
}










 
 
  
  
	// 读取pdf文件
	/**
	 * Extracts the text of a PDF document held in memory.
	 *
	 * @param file raw bytes of the PDF
	 * @return the extracted text, or an empty string when loading or stripping fails with an
	 *         I/O error (the error is printed to standard error)
	 */
	private static String getTextFormPDF(byte[] file) {
		PDDocument document = null;
		String extracted = "";
		try {
			document = PDDocument.load(new ByteArrayInputStream(file));
			extracted = new PDFTextStripper().getText(document);
		} catch (IOException loadOrStripFailure) {
			loadOrStripFailure.printStackTrace();
		} finally {
			// Release the document whether or not extraction succeeded.
			if (document != null) {
				try {
					document.close();
				} catch (IOException closeFailure) {
					closeFailure.printStackTrace();
				}
			}
		}
		return extracted;
	}

	// 读取txt文件
	/**
	 * Decodes a plain-text file with the charset reported by {@code get_charset}.
	 *
	 * @param file raw bytes of the text file
	 * @return the decoded text, or an empty string when charset detection or decoding fails
	 *         (the error is printed to standard error)
	 */
	private static String getTextFormTxt(byte[] file) {
		try {
			return new String(file, get_charset(file));
		} catch (IOException failure) {
			// Also covers UnsupportedEncodingException, which is an IOException.
			failure.printStackTrace();
			return "";
		}
	}

wooden_fish丶

关注

2
点赞
踩
19

收藏

觉得还不错? 一键收藏
2
评论
JAVA提取Word,Excel,PPT,PDF,TXT等文档文字内容

首先引入 Maven 依赖：org.apache.poi:poi:3.15、org.apache.poi:poi-ooxml:3.15、org.apache.poi:poi-scratchpad:3.15、org.apache.pdfbox:pdfbox:2.0.4；随后是 public class ParseText 的完整代码……
复制链接

扫一扫