做之前,要先下载pdfbox和poi插件,网上很多,很容易找到,代码demo如下
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.log4j.Logger;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
/**
* 读取各种文件的类
* 作用:用于读取各种文件的内容
* @author sang
*
*/
public class ReadFile {
private static final Logger log=Logger.getLogger(ReadFile.class);
/**
 * Demo entry point: extracts the text of a PDF file, prints it to standard
 * output followed by a separator line, and writes it to the class logger.
 *
 * @param args optional; when at least one argument is given, args[0] is used
 *             as the PDF path, otherwise the original sample path is used
 */
public static void main(String args[]){
    // Keep the original sample path as the default so existing usage is unchanged.
    String path = "D:\\temp\\106-113.p3.pdf";
    if (args != null && args.length > 0) {
        path = args[0];
    }
    String content = readPdf(path);
    System.out.println(content);
    System.out.println("****************************************************************************************");
    log.info(content);
}
/**
 * Reads the text of a Word document through HWPFDocument.
 *
 * <p>The text of every paragraph is appended in order with no separator added.
 * A failure is reported on standard output (message plus stack trace) instead
 * of being silently swallowed, and whatever was collected so far is returned.
 *
 * @author sang
 * @param path path of the file to read
 * @return the collected text, trimmed; empty when nothing could be read
 */
public static String readWord(String path) {
    StringBuffer content = new StringBuffer("");
    FileInputStream in = null;
    try {
        in = new FileInputStream(path);
        HWPFDocument doc = new HWPFDocument(in);
        Range range = doc.getRange();
        int paragraphCount = range.numParagraphs();
        for (int i = 0; i < paragraphCount; i++) {
            Paragraph paragraph = range.getParagraph(i);
            content.append(paragraph.text());
        }
    } catch (Exception e) {
        // Best-effort read: report the problem and fall through to return partial text.
        System.out.println("读取word文件出现异常");
        e.printStackTrace();
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
    return content.toString().trim();
}
/**
 * Reads the text of a PDF file using PDFParser and PDFTextStripper.
 *
 * <p>On failure a message and the stack trace are printed and whatever was
 * collected so far is returned. The input stream and the parsed document are
 * released in all cases, including when parsing or extraction throws.
 *
 * @author sang
 * @param path path of the file to read
 * @return the extracted text, trimmed; empty when nothing could be read
 */
public static String readPdf(String path) {
    StringBuffer content = new StringBuffer("");
    FileInputStream fis = null;
    PDDocument document = null;
    try {
        fis = new FileInputStream(path);
        PDFParser parser = new PDFParser(fis);
        parser.parse();
        // Fetch the document once so the instance that is read is the one that gets closed.
        document = parser.getPDDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        content.append(stripper.getText(document));
    } catch (Exception e) {
        System.out.println("读取pdf文件出现异常");
        e.printStackTrace();
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
    return content.toString().trim();
}
/**
 * Reads the text of a PDF file (second variant): parses the file, wraps the
 * resulting COSDocument in a PDDocument and extracts its text.
 *
 * <p>When the file cannot be opened or read, the stack trace is printed and an
 * empty string is returned (previously this path ended in a
 * NullPointerException). The stream and document are closed in all cases.
 *
 * @param path path of the file to read
 * @return the extracted text, trimmed; empty when the file could not be read
 */
public static String readpaffile(String path) {
    // Start from "" rather than null so the trim() below is safe after a failure.
    String docText = "";
    FileInputStream in = null;
    PDDocument document = null;
    try {
        in = new FileInputStream(new File(path));
        PDFParser parser = new PDFParser(in);
        parser.parse();
        COSDocument cosdoc = parser.getDocument();
        document = new PDDocument(cosdoc);
        PDFTextStripper stripper = new PDFTextStripper();
        String extracted = stripper.getText(document);
        if (extracted != null) {
            docText = extracted;
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
    return docText.trim();
}
/**
 * Reads an HTML file line by line, decoding it as GBK.
 *
 * <p>Equivalent to {@code readHtml(urlString, "GBK")}.
 *
 * @author sang
 * @param urlString path of the file to read
 * @return the file content with every line terminated by "\n"; empty when
 *         the file could not be read
 */
public static String readHtml(String urlString) {
    return readHtml(urlString, "GBK");
}

/**
 * Reads an HTML file line by line using the given character encoding.
 *
 * <p>The encoding has to match the one the page was saved in, otherwise the
 * text comes out garbled. On failure a message and the stack trace are printed
 * and whatever was read so far is returned. The underlying stream is closed in
 * all cases.
 *
 * @param urlString   path of the file to read
 * @param charsetName name of the charset used to decode the file, e.g. "GBK"
 * @return the file content with every line terminated by "\n"; empty when
 *         the file could not be read
 */
public static String readHtml(String urlString, String charsetName) {
    StringBuffer content = new StringBuffer("");
    FileInputStream fis = null;
    BufferedReader reader = null;
    try {
        fis = new FileInputStream(new File(urlString));
        reader = new BufferedReader(new InputStreamReader(fis, charsetName));
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line).append("\n");
        }
    } catch (Exception e) {
        System.out.println("读取html出现异常");
        e.printStackTrace();
    } finally {
        try {
            if (reader != null) {
                // Closing the reader also closes the wrapped stream.
                reader.close();
            } else if (fis != null) {
                // The reader was never created (e.g. unsupported charset).
                fis.close();
            }
        } catch (IOException ignored) {
            // Nothing useful can be done when closing fails.
        }
    }
    return content.toString();
}
/**
 * Reads a text file line by line using the platform default encoding.
 *
 * <p>Each line is followed by a carriage return ("\r") in the result; the
 * final result is trimmed, so no trailing separator remains. On failure a
 * message and the stack trace are printed and whatever was read so far is
 * returned. The reader is closed in all cases.
 *
 * @author sang
 * @param path path of the file to read
 * @return the file content, trimmed; empty when the file could not be read
 */
public static String readTxt(String path) {
    StringBuffer content = new StringBuffer("");
    BufferedReader br = null;
    try {
        br = new BufferedReader(new FileReader(path));
        String line;
        while ((line = br.readLine()) != null) {
            content.append(line).append("\r");
        }
    } catch (IOException e) {
        System.out.println("读取txt出现异常");
        e.printStackTrace();
    } finally {
        if (br != null) {
            try {
                // Closing the BufferedReader also closes the wrapped FileReader.
                br.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
    return content.toString().trim();
}
/**
 * Reads the text of a PowerPoint file through HSLF.
 *
 * <p>For every slide the text of each text run is appended, followed by the
 * slide title when the slide has one. No separators are added. On failure a
 * message is printed and whatever was collected so far is returned. The input
 * stream is closed in all cases.
 *
 * @author sang
 * @param url path of the file to read
 * @return the collected text; empty when nothing could be read
 * @throws Exception declared for compatibility with existing callers;
 *                   failures while reading are caught and reported instead
 */
public static String ReadPPt(String url) throws Exception {
    StringBuffer content = new StringBuffer("");
    FileInputStream in = null;
    try {
        in = new FileInputStream(url);
        SlideShow ss = new SlideShow(new HSLFSlideShow(in));
        Slide[] slides = ss.getSlides();
        for (int i = 0; i < slides.length; i++) {
            TextRun[] runs = slides[i].getTextRuns();
            for (int j = 0; j < runs.length; j++) {
                content.append(runs[j].getText());
            }
            // Appending a null String would write the literal "null" into the result.
            String title = slides[i].getTitle();
            if (title != null) {
                content.append(title);
            }
        }
    } catch (Exception ex) {
        System.out.println("读取ppt异常:" + ex.toString());
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
    return content.toString();
}
/**
 * Reads the cell contents of an Excel (.xls) workbook through HSSF.
 *
 * <p>Sheets, rows and cells are visited in order and each cell value is
 * appended with no separator. String cells contribute their text, numeric
 * cells their numeric value, boolean cells their boolean value and formula
 * cells their formula text; other cell types contribute nothing. Reading by
 * cell type avoids aborting the whole read when a cell is not a string cell.
 * On failure a message is printed and whatever was collected so far is
 * returned. The input stream is closed in all cases.
 *
 * @author sang
 * @param url path of the file to read
 * @return the concatenated cell contents; empty when nothing could be read
 * @throws Exception declared for compatibility with existing callers;
 *                   failures while reading are caught and reported instead
 */
public static String ReadExcel(String url) throws Exception {
    StringBuffer content = new StringBuffer();
    FileInputStream in = null;
    try {
        in = new FileInputStream(url);
        HSSFWorkbook workbook = new HSSFWorkbook(in);
        for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
            HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            if (sheet == null) {
                continue;
            }
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                HSSFRow row = sheet.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // getLastCellNum() is documented as the last cell index plus one,
                // so the bound is exclusive (unlike getLastRowNum()).
                for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                    HSSFCell cell = row.getCell(cellIndex);
                    if (cell == null) {
                        continue;
                    }
                    switch (cell.getCellType()) {
                    case HSSFCell.CELL_TYPE_STRING:
                        content.append(cell.getStringCellValue());
                        break;
                    case HSSFCell.CELL_TYPE_NUMERIC:
                        content.append(cell.getNumericCellValue());
                        break;
                    case HSSFCell.CELL_TYPE_BOOLEAN:
                        content.append(cell.getBooleanCellValue());
                        break;
                    case HSSFCell.CELL_TYPE_FORMULA:
                        content.append(cell.getCellFormula());
                        break;
                    default:
                        // Blank and error cells carry no text.
                        break;
                    }
                }
            }
        }
    } catch (Exception e) {
        System.out.println(" 读取excel异常 : " + e);
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
    return content.toString();
}
}
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.log4j.Logger;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
/**
* 读取各种文件的类
* 作用:用于读取各种文件的内容
* @author sang
*
*/
public class ReadFile {
private static final Logger log=Logger.getLogger(ReadFile.class);
/**
 * Demo entry point: extracts the text of a PDF file, prints it to standard
 * output followed by a separator line, and writes it to the class logger.
 *
 * @param args optional; when at least one argument is given, args[0] is used
 *             as the PDF path, otherwise the original sample path is used
 */
public static void main(String args[]){
    // Keep the original sample path as the default so existing usage is unchanged.
    String path = "D:\\temp\\106-113.p3.pdf";
    if (args != null && args.length > 0) {
        path = args[0];
    }
    String content = readPdf(path);
    System.out.println(content);
    System.out.println("****************************************************************************************");
    log.info(content);
}
/**
 * Reads the text of a Word document through HWPFDocument.
 *
 * <p>The text of every paragraph is appended in order with no separator added.
 * A failure is reported on standard output (message plus stack trace) instead
 * of being silently swallowed, and whatever was collected so far is returned.
 *
 * @author sang
 * @param path path of the file to read
 * @return the collected text, trimmed; empty when nothing could be read
 */
public static String readWord(String path) {
    StringBuffer content = new StringBuffer("");
    FileInputStream in = null;
    try {
        in = new FileInputStream(path);
        HWPFDocument doc = new HWPFDocument(in);
        Range range = doc.getRange();
        int paragraphCount = range.numParagraphs();
        for (int i = 0; i < paragraphCount; i++) {
            Paragraph paragraph = range.getParagraph(i);
            content.append(paragraph.text());
        }
    } catch (Exception e) {
        // Best-effort read: report the problem and fall through to return partial text.
        System.out.println("读取word文件出现异常");
        e.printStackTrace();
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
    return content.toString().trim();
}
/**
 * Reads the text of a PDF file using PDFParser and PDFTextStripper.
 *
 * <p>On failure a message and the stack trace are printed and whatever was
 * collected so far is returned. The input stream and the parsed document are
 * released in all cases, including when parsing or extraction throws.
 *
 * @author sang
 * @param path path of the file to read
 * @return the extracted text, trimmed; empty when nothing could be read
 */
public static String readPdf(String path) {
    StringBuffer content = new StringBuffer("");
    FileInputStream fis = null;
    PDDocument document = null;
    try {
        fis = new FileInputStream(path);
        PDFParser parser = new PDFParser(fis);
        parser.parse();
        // Fetch the document once so the instance that is read is the one that gets closed.
        document = parser.getPDDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        content.append(stripper.getText(document));
    } catch (Exception e) {
        System.out.println("读取pdf文件出现异常");
        e.printStackTrace();
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
    return content.toString().trim();
}
/**
 * Reads the text of a PDF file (second variant): parses the file, wraps the
 * resulting COSDocument in a PDDocument and extracts its text.
 *
 * <p>When the file cannot be opened or read, the stack trace is printed and an
 * empty string is returned (previously this path ended in a
 * NullPointerException). The stream and document are closed in all cases.
 *
 * @param path path of the file to read
 * @return the extracted text, trimmed; empty when the file could not be read
 */
public static String readpaffile(String path) {
    // Start from "" rather than null so the trim() below is safe after a failure.
    String docText = "";
    FileInputStream in = null;
    PDDocument document = null;
    try {
        in = new FileInputStream(new File(path));
        PDFParser parser = new PDFParser(in);
        parser.parse();
        COSDocument cosdoc = parser.getDocument();
        document = new PDDocument(cosdoc);
        PDFTextStripper stripper = new PDFTextStripper();
        String extracted = stripper.getText(document);
        if (extracted != null) {
            docText = extracted;
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
    return docText.trim();
}
/**
 * Reads an HTML file line by line, decoding it as GBK.
 *
 * <p>Equivalent to {@code readHtml(urlString, "GBK")}.
 *
 * @author sang
 * @param urlString path of the file to read
 * @return the file content with every line terminated by "\n"; empty when
 *         the file could not be read
 */
public static String readHtml(String urlString) {
    return readHtml(urlString, "GBK");
}

/**
 * Reads an HTML file line by line using the given character encoding.
 *
 * <p>The encoding has to match the one the page was saved in, otherwise the
 * text comes out garbled. On failure a message and the stack trace are printed
 * and whatever was read so far is returned. The underlying stream is closed in
 * all cases.
 *
 * @param urlString   path of the file to read
 * @param charsetName name of the charset used to decode the file, e.g. "GBK"
 * @return the file content with every line terminated by "\n"; empty when
 *         the file could not be read
 */
public static String readHtml(String urlString, String charsetName) {
    StringBuffer content = new StringBuffer("");
    FileInputStream fis = null;
    BufferedReader reader = null;
    try {
        fis = new FileInputStream(new File(urlString));
        reader = new BufferedReader(new InputStreamReader(fis, charsetName));
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line).append("\n");
        }
    } catch (Exception e) {
        System.out.println("读取html出现异常");
        e.printStackTrace();
    } finally {
        try {
            if (reader != null) {
                // Closing the reader also closes the wrapped stream.
                reader.close();
            } else if (fis != null) {
                // The reader was never created (e.g. unsupported charset).
                fis.close();
            }
        } catch (IOException ignored) {
            // Nothing useful can be done when closing fails.
        }
    }
    return content.toString();
}
/**
 * Reads a text file line by line using the platform default encoding.
 *
 * <p>Each line is followed by a carriage return ("\r") in the result; the
 * final result is trimmed, so no trailing separator remains. On failure a
 * message and the stack trace are printed and whatever was read so far is
 * returned. The reader is closed in all cases.
 *
 * @author sang
 * @param path path of the file to read
 * @return the file content, trimmed; empty when the file could not be read
 */
public static String readTxt(String path) {
    StringBuffer content = new StringBuffer("");
    BufferedReader br = null;
    try {
        br = new BufferedReader(new FileReader(path));
        String line;
        while ((line = br.readLine()) != null) {
            content.append(line).append("\r");
        }
    } catch (IOException e) {
        System.out.println("读取txt出现异常");
        e.printStackTrace();
    } finally {
        if (br != null) {
            try {
                // Closing the BufferedReader also closes the wrapped FileReader.
                br.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
    return content.toString().trim();
}
/**
 * Reads the text of a PowerPoint file through HSLF.
 *
 * <p>For every slide the text of each text run is appended, followed by the
 * slide title when the slide has one. No separators are added. On failure a
 * message is printed and whatever was collected so far is returned. The input
 * stream is closed in all cases.
 *
 * @author sang
 * @param url path of the file to read
 * @return the collected text; empty when nothing could be read
 * @throws Exception declared for compatibility with existing callers;
 *                   failures while reading are caught and reported instead
 */
public static String ReadPPt(String url) throws Exception {
    StringBuffer content = new StringBuffer("");
    FileInputStream in = null;
    try {
        in = new FileInputStream(url);
        SlideShow ss = new SlideShow(new HSLFSlideShow(in));
        Slide[] slides = ss.getSlides();
        for (int i = 0; i < slides.length; i++) {
            TextRun[] runs = slides[i].getTextRuns();
            for (int j = 0; j < runs.length; j++) {
                content.append(runs[j].getText());
            }
            // Appending a null String would write the literal "null" into the result.
            String title = slides[i].getTitle();
            if (title != null) {
                content.append(title);
            }
        }
    } catch (Exception ex) {
        System.out.println("读取ppt异常:" + ex.toString());
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
    return content.toString();
}
/**
 * Reads the cell contents of an Excel (.xls) workbook through HSSF.
 *
 * <p>Sheets, rows and cells are visited in order and each cell value is
 * appended with no separator. String cells contribute their text, numeric
 * cells their numeric value, boolean cells their boolean value and formula
 * cells their formula text; other cell types contribute nothing. Reading by
 * cell type avoids aborting the whole read when a cell is not a string cell.
 * On failure a message is printed and whatever was collected so far is
 * returned. The input stream is closed in all cases.
 *
 * @author sang
 * @param url path of the file to read
 * @return the concatenated cell contents; empty when nothing could be read
 * @throws Exception declared for compatibility with existing callers;
 *                   failures while reading are caught and reported instead
 */
public static String ReadExcel(String url) throws Exception {
    StringBuffer content = new StringBuffer();
    FileInputStream in = null;
    try {
        in = new FileInputStream(url);
        HSSFWorkbook workbook = new HSSFWorkbook(in);
        for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
            HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            if (sheet == null) {
                continue;
            }
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                HSSFRow row = sheet.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // getLastCellNum() is documented as the last cell index plus one,
                // so the bound is exclusive (unlike getLastRowNum()).
                for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                    HSSFCell cell = row.getCell(cellIndex);
                    if (cell == null) {
                        continue;
                    }
                    switch (cell.getCellType()) {
                    case HSSFCell.CELL_TYPE_STRING:
                        content.append(cell.getStringCellValue());
                        break;
                    case HSSFCell.CELL_TYPE_NUMERIC:
                        content.append(cell.getNumericCellValue());
                        break;
                    case HSSFCell.CELL_TYPE_BOOLEAN:
                        content.append(cell.getBooleanCellValue());
                        break;
                    case HSSFCell.CELL_TYPE_FORMULA:
                        content.append(cell.getCellFormula());
                        break;
                    default:
                        // Blank and error cells carry no text.
                        break;
                    }
                }
            }
        }
    } catch (Exception e) {
        System.out.println(" 读取excel异常 : " + e);
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
    return content.toString();
}
}