packagecom.ducklyl;importjava.io.File;importjava.io.FileInputStream;importjava.util.Iterator;importorg.apache.poi.hslf.model.Slide;importorg.apache.poi.hslf.model.TextRun;importorg.apache.poi.hslf.usermodel.SlideShow;importorg.apache.poi.hssf.usermodel.HSSFCell;importorg.apache.poi.hssf.usermodel.HSSFRow;importorg.apache.poi.hssf.usermodel.HSSFSheet;importorg.apache.poi.hssf.usermodel.HSSFWorkbook;importorg.apache.poi.hwpf.HWPFDocument;importorg.apache.poi.hwpf.usermodel.Paragraph;importorg.apache.poi.hwpf.usermodel.Range;importorg.dom4j.Document;importorg.dom4j.Element;importorg.dom4j.io.SAXReader;importorg.htmlparser.Parser;importorg.htmlparser.filters.*;importorg.htmlparser.*;importorg.htmlparser.nodes.TextNode;importorg.htmlparser.util.*;importorg.pdfbox.pdfparser.PDFParser;importorg.pdfbox.pdmodel.PDDocument;importorg.pdfbox.util.PDFTextStripper;publicclassHandleFile {publicstaticvoidmain(String args[]){
String str="e:\\test.HTML";
System.out.println(handleFile(str));
}publicstaticString handleFile(String filename){
String result="";
String fileType=filename.substring(filename.lastIndexOf(".")+1, filename.length());if(fileType.equalsIgnoreCase("pdf"))
result=handlePdf(filename);elseif(fileType.equalsIgnoreCase("xls"))
result=handleExcel(filename);elseif(fileType.equalsIgnoreCase("doc"))
result=handleDoc(filename);elseif(fileType.equalsIgnoreCase("xml"))
result=handleXml(filename);elseif(fileType.equalsIgnoreCase("ppt"))
result=handlePPT(filename);elseif(fileType.equalsIgnoreCase("htm")||fileType.equalsIgnoreCase("html"))
result=handleHtml(filename);returnresult;
}/*** 解析HTML
*@paramfilename
*@return*/publicstaticString handleHtml(String filename){
String content="";try{
File file=newFile(filename);if(!file.exists())returncontent;
Parser parser=newParser(filename);
parser.setEncoding("UTF-8");
NodeFilter textFilter=newNodeClassFilter(TextNode.class);
NodeList nodes=parser.extractAllNodesThatMatch(textFilter);for(inti=0;i
TextNode textnode=(TextNode)nodes.elementAt(i);
String line=textnode.toPlainTextString().trim();if(line.equals(""))continue;
content=content+line;
}
}catch(Exception e){
e.printStackTrace();
}returncontent;
}/*** 解析PPT
*@paramfilename
*@return*/publicstaticString handlePPT(String filename){
StringBuffer content=newStringBuffer("");try{
File file=newFile(filename);if(!file.exists()) {returncontent.toString();
}
FileInputStream instream=newFileInputStream(file);
SlideShow ppt=newSlideShow(instream);
Slide[] slides=ppt.getSlides();for(inti=0;i
TextRun[] t=slides[i].getTextRuns();//为了取得幻灯片的文字内容,建立TextRunfor(intj=0;j
content.append(t[j].getText());//这里会将文字内容加到content中去}
content.append(slides[i].getTitle());
}
}catch(Exception e){
e.printStackTrace();
}returncontent.toString();
}/*** 解析XML
*@paramfilename
*@return*/publicstaticString handleXml(String filename){
String content="",value="",text="";try{
File file=newFile(filename);if(!file.exists()) {returncontent;
}
SAXReader saxReader=newSAXReader();
Document document=saxReader.read(file);
Element root=document.getRootElement() ;
Iterator iter=root.elementIterator() ;while(iter.hasNext()){
Element element=(Element)iter.next();
value=element.getStringValue();if(!value.trim().equals("")) content=content+value;
}
}catch(Exception e){
e.printStackTrace();
}returncontent;
}/*** 解析DOC
*@paramfilename
*@return*/publicstaticString handleDoc(String filename){
String content="";try{
File file=newFile(filename);if(!file.exists()) {returncontent;
}
FileInputStream instream=newFileInputStream(file);
HWPFDocument doc=newHWPFDocument(instream);
Range range=doc.getRange();
String text=range.text();for(inti=0;i
Paragraph p=range.getParagraph(i);
content=content+p.text().trim()+"\n";
}
}catch(Exception e){
e.printStackTrace();
}returncontent;
}/*** 解析PDF
*@paramfilename
*@return*/publicstaticString handlePdf(String filename){
String contenttxt="";try{
File file=newFile(filename);if(!file.exists()){returncontenttxt;
}
FileInputStream instream=newFileInputStream(file);
PDFParser parser=newPDFParser(instream);
parser.parse();
PDDocument pdfdocument=parser.getPDDocument();
PDFTextStripper pdfstripper=newPDFTextStripper();
contenttxt=pdfstripper.getText(pdfdocument);
}catch(Exception e){
e.printStackTrace();
}returncontenttxt;
}/*** 解析EXCEL
*@paramfilename
*@return*/publicstaticString handleExcel(String filename){
String content="";try{
File file=newFile(filename);if(!file.exists()) {returncontent;
}
HSSFWorkbook workbook=newHSSFWorkbook(newFileInputStream(file));
HSSFSheet sheet=workbook.getSheetAt(0);for(inti=0;i
sheet=workbook.getSheetAt(i);if(sheet!=null){for(intm=0;m
HSSFRow row=sheet.getRow(m);if(row==null)break;for(intn=0;n
HSSFCell cell=row.getCell(n);if(cell==null)break;inttype=cell.getCellType();switch(type){case0:
content=content+cell.getNumericCellValue();break;case1:
content=content+cell.getStringCellValue();break;case3:break;default:
;
}
}
content=content+"\n";
}
}
content=content+"\n";
}
}catch(Exception e){
e.printStackTrace();
}returncontent;
}
}