Java 解析 PDF、Word、PPT、Excel

用pdfbox的jar包来解析pdf:

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.OutputStreamWriter;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

/**
 * Extracts the plain text of a PDF file using PDFBox.
 */
public class Pdf2text {

    /**
     * Reads the given PDF and returns its text content.
     *
     * <p>Any exception raised while loading or stripping the document is
     * printed with {@code printStackTrace()} and an empty string is returned.
     * Errors (e.g. {@link OutOfMemoryError}) are no longer swallowed by a
     * {@code return} inside {@code finally}.
     *
     * <p>The text is encoded and decoded explicitly as UTF-8 so characters that
     * the platform default charset cannot represent are not lost. The length
     * printed to standard output is therefore the UTF-8 byte count.
     *
     * @param f the PDF file to read
     * @return the extracted text, or {@code ""} when extraction failed
     * @throws Exception kept in the signature for existing callers; extraction
     *         failures themselves are caught and reported inside the method
     */
    public static String getTxt(File f) throws Exception {
        String ts = "";
        PDDocument pdfdocument = null;
        try {
            pdfdocument = PDDocument.load(f);

            ByteArrayOutputStream out = new ByteArrayOutputStream();
            OutputStreamWriter writer = new OutputStreamWriter(out, "UTF-8");
            PDFTextStripper stripper = new PDFTextStripper();

            stripper.writeText(pdfdocument.getDocument(), writer);

            // Closing the writer flushes any buffered characters into 'out'
            // before the bytes are read back.
            writer.close();
            byte[] contents = out.toByteArray();
            ts = new String(contents, "UTF-8");
            System.out.println(f.getName() + "length is:" + contents.length + "\n");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the document even when loading succeeded but stripping failed.
            if (pdfdocument != null) {
                try {
                    pdfdocument.close();
                } catch (Exception closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
        return ts;
    }

    /**
     * Demo entry point: prints the text of a hard-coded PDF path.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File file = new File("E:/600536_2008_zzy.pdf");
        try {
            System.out.println(Pdf2text.getTxt(file));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
======================

word,excel和ppt都用POI的jar包来解析:

import java.io.File;

import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;

/**
 * Prints the text of an Office document obtained through POI's
 * {@code ExtractorFactory}.
 */
public class DocxParser {

    /**
     * Creates a text extractor for a hard-coded file and prints its text
     * between two separator lines. Any exception is printed with
     * {@code printStackTrace()}.
     *
     * <p>The same call was also written for test.pptx, test.xlsx, test.xls,
     * test.doc and test.ppt in the same directory; swap the file name to try
     * one of those.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String rule = "====================";
        try {
            File source = new File("D:\\test.docx");
            POITextExtractor textExtractor = ExtractorFactory.createExtractor(source);

            System.out.println("Document Text: ");
            System.out.println(rule);
            System.out.println(textExtractor.getText());
            System.out.println(rule);
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }

}

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.poi.hwpf.extractor.WordExtractor;

/**
 * Prints the text of a legacy .doc file using POI's {@code WordExtractor}.
 */
public class Word2text {

    /**
     * Opens a hard-coded .doc file, prints a heading line followed by the
     * extracted text, and always closes the input stream afterwards.
     * I/O failures are printed with {@code printStackTrace()}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File file = new File("E:\\2009.doc");
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(file);
            WordExtractor wordExtractor = new WordExtractor(fis);
            System.out.println("【 使用getText()方法提取的Word文件的内容如下所示:】");
            System.out.println(wordExtractor.getText());
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The stream was previously never closed; release the file handle
            // whether or not extraction succeeded.
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }
}

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;

import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.usermodel.SlideShow;

/**
 * Prints the text of a legacy .ppt file using POI's HSLF classes.
 */
public class Ppt2text {

    /**
     * Opens a hard-coded .ppt file, prints its text via
     * {@link #getDocument(InputStream)}, and closes the stream afterwards.
     *
     * @param args unused
     * @throws FileNotFoundException if the hard-coded file cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        File file = new File("E:\\1025681983.ppt");
        InputStream fis = new FileInputStream(file);
        try {
            getDocument(fis);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The stream was previously left open; release the file handle.
            try {
                fis.close();
            } catch (Exception closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }

    /**
     * Reads a slide show from the stream and prints, for every slide, the
     * text of each of its text runs followed by the slide title, all
     * concatenated into one string.
     *
     * <p>A slide whose title is {@code null} contributes no title text;
     * previously the literal word "null" was appended in that case.
     *
     * <p>Exceptions raised while reading are caught and their
     * {@code toString()} is printed to standard output. The caller keeps
     * ownership of the stream; this method does not close it.
     *
     * @param is input stream positioned at the start of a .ppt file
     * @throws Exception kept in the signature for existing callers
     */
    public static void getDocument(InputStream is) throws Exception {
        StringBuilder content = new StringBuilder();
        try {
            // Build the SlideShow from the file's input stream.
            SlideShow ss = new SlideShow(new HSLFSlideShow(is));
            Slide[] slides = ss.getSlides();
            for (int i = 0; i < slides.length; i++) {
                // The text runs carry the slide's textual content.
                TextRun[] runs = slides[i].getTextRuns();
                for (int j = 0; j < runs.length; j++) {
                    content.append(runs[j].getText());
                }
                String title = slides[i].getTitle();
                if (title != null) {
                    content.append(title);
                }
            }
            System.out.println(content.toString());
        } catch (Exception ex) {
            System.out.println(ex.toString());
        }
    }

}
=============

Excel 也可以用 jxl 的 jar 包来解析(下面第二个示例演示如何用 jxl 生成 Excel 文件):

import java.io.File;

import jxl.Cell;
import jxl.CellType;
import jxl.DateCell;
import jxl.NumberCell;
import jxl.Sheet;
import jxl.Workbook;


/**
 * Dumps the first columns of the first sheet of an .xls file to standard
 * output using jxl, one row per line with tab-separated cells.
 */
public class Excel2text {

    /** Upper bound on the number of columns printed per row. */
    private static final int MAX_COLUMNS = 3;

    /**
     * Reads a hard-coded workbook and prints up to {@link #MAX_COLUMNS}
     * columns of every row of its first sheet.
     *
     * <p>Failures are printed with {@code printStackTrace()} instead of being
     * silently discarded, and the workbook is closed on every path.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        Workbook workbook = null;
        try {
            try {
                workbook = Workbook.getWorkbook(new File("e:\\Dealerlist_3.xls"));
            } catch (Exception e) {
                // Keep the original message but preserve the underlying cause.
                throw new Exception("file to import not found!", e);
            }

            Sheet sheet = workbook.getSheet(0);

            // Never ask for a column the sheet does not have.
            int columnCount = Math.min(MAX_COLUMNS, sheet.getColumns());
            int rowCount = sheet.getRows();
            for (int row = 0; row < rowCount; row++) {
                for (int col = 0; col < columnCount; col++) {
                    // Note: getCell takes (column, row), in that order.
                    Cell cell = sheet.getCell(col, row);
                    // Handle each cell type separately; otherwise formatted
                    // content may not come out correctly.
                    if (cell.getType() == CellType.NUMBER) {
                        System.out.print(((NumberCell) cell).getValue());
                    } else if (cell.getType() == CellType.DATE) {
                        System.out.print(((DateCell) cell).getDate());
                    } else {
                        System.out.print(cell.getContents());
                    }
                    System.out.print("\t");
                }
                System.out.print("\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close the workbook on every path, otherwise memory is leaked.
            if (workbook != null) {
                try {
                    workbook.close();
                } catch (Exception closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }
}

import java.io.*;
import jxl.*;
import jxl.write.*;
import jxl.format.*;

public class Text2Excel {
public static void main(String args[]) {

try {

File tempFile = new File("e:" + java.io.File.separator
+ "output00.xls");
System.out.println("e:" + java.io.File.separator + "output00.xls");

WritableWorkbook workbook = Workbook.createWorkbook(tempFile);
WritableSheet sheet = workbook.createSheet("TestCreateExcel", 0);

// 一些临时变量,用于写到excel中
Label l = null;
jxl.write.Number n = null;
jxl.write.DateTime d = null;

// 预定义的一些字体和格式,同一个Excel中最好不要有太多格式
WritableFont headerFont = new WritableFont(WritableFont.ARIAL, 12,
WritableFont.BOLD, false, UnderlineStyle.NO_UNDERLINE,
jxl.format.Colour.BLUE);
WritableCellFormat headerFormat = new WritableCellFormat(headerFont);

WritableFont titleFont = new WritableFont(WritableFont.ARIAL, 10,
WritableFont.NO_BOLD, false, UnderlineStyle.NO_UNDERLINE,
jxl.format.Colour.RED);
WritableCellFormat titleFormat = new WritableCellFormat(titleFont);

WritableFont detFont = new WritableFont(WritableFont.ARIAL, 10,
WritableFont.NO_BOLD, false, UnderlineStyle.NO_UNDERLINE,
jxl.format.Colour.BLACK);
WritableCellFormat detFormat = new WritableCellFormat(detFont);

NumberFormat nf = new NumberFormat("0.00000"); // 用于Number的格式
WritableCellFormat priceFormat = new WritableCellFormat(detFont, nf);

DateFormat df = new DateFormat("yyyy-MM-dd");// 用于日期的
WritableCellFormat dateFormat = new WritableCellFormat(detFont, df);

// 剩下的事情,就是用上面的内容和格式创建一些单元格,再加到sheet中
l = new Label(0, 0, "用于测试的Excel文件", headerFormat);
sheet.addCell(l);

// add Title
int column = 0;
l = new Label(column++, 2, "标题", titleFormat);
sheet.addCell(l);
l = new Label(column++, 2, "日期", titleFormat);
sheet.addCell(l);
l = new Label(column++, 2, "货币", titleFormat);
sheet.addCell(l);
l = new Label(column++, 2, "价格", titleFormat);
sheet.addCell(l);

// add detail
int i = 0;
column = 0;
l = new Label(column++, i + 3, "标题 " + i, detFormat);
sheet.addCell(l);
d = new DateTime(column++, i + 3, new java.util.Date(), dateFormat);
sheet.addCell(d);
l = new Label(column++, i + 3, "CNY", detFormat);
sheet.addCell(l);
n = new jxl.write.Number(column++, i + 3, 5.678, priceFormat);
sheet.addCell(n);

i++;
column = 0;
l = new Label(column++, i + 3, "标题 " + i, detFormat);
sheet.addCell(l);
d = new DateTime(column++, i + 3, new java.util.Date(), dateFormat);
sheet.addCell(d);
l = new Label(column++, i + 3, "SGD", detFormat);
sheet.addCell(l);
n = new jxl.write.Number(column++, i + 3, 98832, priceFormat);
sheet.addCell(n);

// 设置列的宽度
column = 0;
sheet.setColumnView(column++, 20);
sheet.setColumnView(column++, 20);
sheet.setColumnView(column++, 10);
sheet.setColumnView(column++, 20);

workbook.write();
workbook.close();
} catch (Exception e) {
e.printStackTrace();
}

}
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值