JAVA解析文件pdf、word、excel

使用pdfbox的jar包来解析pdf:

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.OutputStreamWriter;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

public class Pdf2text {  
    public static String getTxt(File f) throws Exception {    
        String ts="";    
        try{    
            String temp = "";    
            PDDocument pdfdocument = PDDocument.load(f);
            
            ByteArrayOutputStream out = new ByteArrayOutputStream();    
            OutputStreamWriter writer = new OutputStreamWriter(out);    
            PDFTextStripper stripper = new PDFTextStripper();
            
            stripper.writeText(pdfdocument.getDocument(), writer);    
            
            pdfdocument.close();    
            out.close();    
            writer.close();    
            byte[] contents = out.toByteArray();    
            ts = new String(contents);    
            System.out.println(f.getName() + "length is:" + contents.length + "\n");    
        }catch(Exception e){    
            e.printStackTrace();    
        }    
        finally{    
            return ts;    
        }    
    }  
      
    public static void main(String[] args){  
        File file = new File("E:/600536_2008_zzy.pdf");  
        try {  
            System.out.println(Pdf2text.getTxt(file));  
        } catch (Exception e) {  
            // TODO 自动生成 catch 块  
            e.printStackTrace();  
        }  
    }  
}
======================

word,excel和ppt都用POI的jar包来解析:

  1. import java.io.File;  
  2.   
  3. import org.apache.poi.POITextExtractor;  
  4. import org.apache.poi.extractor.ExtractorFactory;  
  5.   
  6. public class DocxParser {  
  7.   
  8.     /**
  9.       * @param args
  10.       */  
  11.     public static void main(String[] args) {  
  12.         try {  
  13.              File inputFile = new File("D:\\test.docx");  
  14.             //File inputFile = new File("D:\\test.pptx");   
  15.             //File inputFile = new File("D:\\test.xlsx");   
  16.             //File inputFile = new File("D:\\test.xls");   
  17.             //File inputFile = new File("D:\\test.doc");   
  18.             //File inputFile = new File("D:\\test.ppt");   
  19.              POITextExtractor extractor = ExtractorFactory  
  20.                      .createExtractor(inputFile);  
  21.              System.out.println("Document Text: ");  
  22.              System.out.println("====================");  
  23.              System.out.println(extractor.getText());  
  24.              System.out.println("====================");  
  25.          } catch (Exception ex) {  
  26.              ex.printStackTrace();  
  27.          }  
  28.      }  
  29.   
  30. }  

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.poi.hwpf.extractor.WordExtractor;

public class Word2text {

public static void main(String[] args) {
   File file = new File("E:\\2009.doc");
   try {
    FileInputStream fis = new FileInputStream(file);
    WordExtractor wordExtractor = new WordExtractor(fis);
    System.out.println("【 使用getText()方法提取的Word文件的内容如下所示:】");
    System.out.println(wordExtractor.getText());
   } catch (FileNotFoundException e) {
    e.printStackTrace();
   } catch (IOException e) {
    e.printStackTrace();
   }
}
}

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;

import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.usermodel.SlideShow;

public class Ppt2text {

/**
* @param args
* @throws FileNotFoundException
*/
public static void main(String[] args) throws FileNotFoundException {
   File file = new File("E:\\1025681983.ppt");
   InputStream fis = new FileInputStream(file);
   try {
    getDocument(fis);
  
   } catch (Exception e) {
  
    e.printStackTrace();
   }
}

public static void getDocument(InputStream is) throws Exception {
   StringBuffer content = new StringBuffer("");
   try {
    SlideShow ss = new SlideShow(new HSLFSlideShow(is));// is
                 // 为文件的InputStream,建立SlideShow
    Slide[] slides = ss.getSlides();// 获得每一张幻灯片
    for (int i = 0; i < slides.length; i++) {
     TextRun[] t = slides[i].getTextRuns();// 为了取得幻灯片的文字内容,建立TextRun
     for (int j = 0; j < t.length; j++) {
      content.append(t[j].getText());// 这里会将文字内容加到content中去
     }
     content.append(slides[i].getTitle());
    }
    String str = new String(content);
    System.out.println(str.toString());
  
   } catch (Exception ex) {
    System.out.println(ex.toString());
   }
  
}

}
=============

excel也可以用jxl的jar包来解析(第二个示例演示如何用jxl生成excel文件):

import java.io.File;

import jxl.Cell;
import jxl.CellType;
import jxl.DateCell;
import jxl.NumberCell;
import jxl.Sheet;
import jxl.Workbook;


public class Excel2text {
public static void main(String args[]) {

   try {

    Workbook workbook = null;

    try {
     workbook = Workbook.getWorkbook(new File("e:\\Dealerlist_3.xls"));
    } catch (Exception e) {
     throw new Exception("file to import not found!");
    }

    Sheet sheet = workbook.getSheet(0);
    Cell cell = null;

    int columnCount = 3;
    int rowCount = sheet.getRows();
    for (int i = 0; i < rowCount; i++) {
     for (int j = 0; j < columnCount; j++) {
      // 注意,这里的两个参数,第一个是表示列的,第二才表示行
      cell = sheet.getCell(j, i);
      // 要根据单元格的类型分别做处理,否则格式化过的内容可能会不正确
      if (cell.getType() == CellType.NUMBER) {
       System.out.print(((NumberCell) cell).getValue());
      } else if (cell.getType() == CellType.DATE) {
       System.out.print(((DateCell) cell).getDate());
      } else {
       System.out.print(cell.getContents());
      }

      // System.out.print(cell.getContents());
      System.out.print("\t");
     }
     System.out.print("\n");
    }
    // 关闭它,否则会有内存泄露
    workbook.close();
   } catch (Exception e) {

   }

}
}

import java.io.*;
import jxl.*;
import jxl.write.*;
import jxl.format.*;

public class Text2Excel {
public static void main(String args[]) {

   try {

    File tempFile = new File("e:" + java.io.File.separator
      + "output00.xls");
    System.out.println("e:" + java.io.File.separator + "output00.xls");

    WritableWorkbook workbook = Workbook.createWorkbook(tempFile);
    WritableSheet sheet = workbook.createSheet("TestCreateExcel", 0);

    // 一些临时变量,用于写到excel中
    Label l = null;
    jxl.write.Number n = null;
    jxl.write.DateTime d = null;

    // 预定义的一些字体和格式,同一个Excel中最好不要有太多格式
    WritableFont headerFont = new WritableFont(WritableFont.ARIAL, 12,
      WritableFont.BOLD, false, UnderlineStyle.NO_UNDERLINE,
      jxl.format.Colour.BLUE);
    WritableCellFormat headerFormat = new WritableCellFormat(headerFont);

    WritableFont titleFont = new WritableFont(WritableFont.ARIAL, 10,
      WritableFont.NO_BOLD, false, UnderlineStyle.NO_UNDERLINE,
      jxl.format.Colour.RED);
    WritableCellFormat titleFormat = new WritableCellFormat(titleFont);

    WritableFont detFont = new WritableFont(WritableFont.ARIAL, 10,
      WritableFont.NO_BOLD, false, UnderlineStyle.NO_UNDERLINE,
      jxl.format.Colour.BLACK);
    WritableCellFormat detFormat = new WritableCellFormat(detFont);

    NumberFormat nf = new NumberFormat("0.00000"); // 用于Number的格式
    WritableCellFormat priceFormat = new WritableCellFormat(detFont, nf);

    DateFormat df = new DateFormat("yyyy-MM-dd");// 用于日期的
    WritableCellFormat dateFormat = new WritableCellFormat(detFont, df);

    // 剩下的事情,就是用上面的内容和格式创建一些单元格,再加到sheet中
    l = new Label(0, 0, "用于测试的Excel文件", headerFormat);
    sheet.addCell(l);

    // add Title
    int column = 0;
    l = new Label(column++, 2, "标题", titleFormat);
    sheet.addCell(l);
    l = new Label(column++, 2, "日期", titleFormat);
    sheet.addCell(l);
    l = new Label(column++, 2, "货币", titleFormat);
    sheet.addCell(l);
    l = new Label(column++, 2, "价格", titleFormat);
    sheet.addCell(l);

    // add detail
    int i = 0;
    column = 0;
    l = new Label(column++, i + 3, "标题 " + i, detFormat);
    sheet.addCell(l);
    d = new DateTime(column++, i + 3, new java.util.Date(), dateFormat);
    sheet.addCell(d);
    l = new Label(column++, i + 3, "CNY", detFormat);
    sheet.addCell(l);
    n = new jxl.write.Number(column++, i + 3, 5.678, priceFormat);
    sheet.addCell(n);

    i++;
    column = 0;
    l = new Label(column++, i + 3, "标题 " + i, detFormat);
    sheet.addCell(l);
    d = new DateTime(column++, i + 3, new java.util.Date(), dateFormat);
    sheet.addCell(d);
    l = new Label(column++, i + 3, "SGD", detFormat);
    sheet.addCell(l);
    n = new jxl.write.Number(column++, i + 3, 98832, priceFormat);
    sheet.addCell(n);

    // 设置列的宽度
    column = 0;
    sheet.setColumnView(column++, 20);
    sheet.setColumnView(column++, 20);
    sheet.setColumnView(column++, 10);
    sheet.setColumnView(column++, 20);

    workbook.write();
    workbook.close();
   } catch (Exception e) {
            e.printStackTrace();
   }

}
}


  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值