基于tm-extractor的Word文档内容搜索软件开发

对于用 Java 解析 Word 文档,主流方法是通过中间件去访问,但这需要用到 JNI。

Apache 和微软合作开发了 POI;POI 虽然功能强大,但用起来也很麻烦。

本文抛弃使用中间件的繁琐方法,直接使用 tm-extractors 对 Word 文档进行解析和检索。

 

tm-extractors 0.4 版本不需要额外的 Apache POI 包,因为里面已经整合了。

tm-extractors 1.0 版本需要 poi-3.0.1-FINAL-20070705.jar,版本一定要正确。

tm-extractors官方网站

http://www.textmining.org/

废话少说,直接上图和代码:

 

package docSearch;

import java.io.File;
import java.util.ArrayList;

public class FileSearch {
    public ArrayList<String> filelist = new ArrayList<String>();
   
 
    public void setFileList(String strPath) {
        File dir = new File(strPath);
        File[] files = dir.listFiles();
       
//        SearchUI sui = new SearchUI();
//        Boolean docselect = sui.getdocCheckBox();
//        Boolean txtselect = sui.gettxtCheckBox();
//        Boolean javaselect = sui.getjavaCheckBox();
          
        if (files == null)   
            return;   
        for (int i = 0; i < files.length; i++) {   
            if (files[i].isDirectory()) {   
                setFileList(files[i].getAbsolutePath());   
            } else {   
                String strFileName = files[i].getAbsolutePath().toLowerCase();
                if((strFileName.endsWith(".doc"))&&
                  ((!files[i].getName().startsWith("~$"))||(!(files[i].length()<500)))){
                 //System.out.println("---"+strFileName+files[i].length());  
                    filelist.add(files[i].getAbsolutePath());
                }
//                else if(strFileName.endsWith(".java")){
//                 filelist.add(files[i].getAbsolutePath());
//                }
//                else if(strFileName.endsWith(".txt")){
//                 filelist.add(files[i].getAbsolutePath());
//                }
                else{
                 
                }
            }
        }
    }
    public ArrayList<String> getFilePath(){

     return filelist;
    }

}

package docSearch;

import java.io.File;
import java.util.ArrayList;
import java.util.StringTokenizer;

/**
 * Holds a list of keywords and reports whether a piece of text contains any of them.
 */
public class FileScan {
    /** Keywords to look for; a text matches when it contains at least one of them. */
    public ArrayList<String> keyWord = new ArrayList<String>();
    /** Paths handed in through {@link #setFilePathList(ArrayList)}; not read by this class. */
    public ArrayList<String> FilePathList = new ArrayList<String>();

    /**
     * Replaces the stored path list.
     *
     * @param filePathList new list (stored by reference)
     */
    public void setFilePathList(ArrayList<String> filePathList) {
        this.FilePathList = filePathList;
    }

    /**
     * Adds one keyword. The string is stored as given; it is not split on whitespace.
     *
     * @param Keyword keyword to add
     */
    public void setKeyWord(String Keyword) {
        keyWord.add(Keyword);
    }

    /**
     * Tests whether the text contains at least one stored keyword.
     *
     * @param fileText text to search; {@code null} is treated as "no match"
     * @return {@code true} if any keyword occurs in {@code fileText}, otherwise {@code false}
     */
    public Boolean scanFile(String fileText) {
        if (fileText == null) {
            // A reader that failed to produce text must not crash the whole search.
            return Boolean.FALSE;
        }
        for (String kw : keyWord) {
            if (FindSubString(fileText, kw) != -1) {
                return Boolean.TRUE;
            }
        }
        return Boolean.FALSE;
    }

    /**
     * Placeholder accessor; no text is stored in this class.
     *
     * @return always {@code null}
     */
    public String getFileText() {
        return null;
    }

    /**
     * Finds the first occurrence of {@code keyword} in {@code fileText}.
     *
     * @param fileText text to search
     * @param keyword  text to look for
     * @return index of the first occurrence, or -1 when there is none or when either
     *         argument is {@code null} or empty
     */
    public int FindSubString(String fileText, String keyword) {
        if (fileText == null || keyword == null) {
            return -1;
        }
        if (fileText.length() == 0 || keyword.length() == 0) {
            // Unlike String.indexOf, an empty keyword counts as "not found" here.
            return -1;
        }
        return fileText.indexOf(keyword);
    }
}

package docSearch;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;

import javax.swing.JOptionPane;

import org.textmining.extraction.word.WordTextExtractor;
import org.textmining.extraction.word.WordTextExtractorFactory;


public class javaReadDoc {
 
 public ArrayList<String> failOpenList = new ArrayList<String>();   
 
 public String readDoc(String docpath){
  
  
  String text = null;
  
  if(docpath.endsWith(".doc")){   
   FileInputStream in = null;
   try {
    in = new FileInputStream(new File(docpath));
   } catch (FileNotFoundException e) {
    // TODO Auto-generated catch block
    System.out.println("新建"+docpath+"文件数据流失败!");
    e.printStackTrace();
   }

   WordTextExtractorFactory extractor = new WordTextExtractorFactory();

   
   try {
    WordTextExtractor ex = (WordTextExtractor) extractor.textExtractor(in);
    text = ex.getText();
    //text = extractor.textExtractor(in).getText();
   } catch (IOException e) {
    // TODO Auto-generated catch block
    System.out.println("读入"+docpath+"文件失败!");
    e.printStackTrace();
   } catch (Exception e) {
    // TODO Auto-generated catch block
    //JOptionPane.showMessageDialog(null,"读取 "+docpath+" 文件失败!/n按确定跳过该文件,继续搜索。");
    failOpenList.add(docpath);
    //System.out.println(docpath+"文件操作失败!");
    return "的";
    //e.printStackTrace();
   }

   
  }
  else{
   try {
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(docpath)));  
     String data = null;  
     while((data = br.readLine())!=null)  
     {  
      text=text+data; 
     }
    
   } catch (IOException e) {
    // TODO 自动生成 catch 块
    failOpenList.add(docpath);
    return "的";
   }
  }
  return text;

 }
 
 public ArrayList<String> getFailOpenFile(){
  return this.failOpenList;
  
 }

}

package docSearch;

import java.awt.BorderLayout;
import java.awt.Color;
import java.awt.Container;
import java.awt.Dimension;
import java.awt.FlowLayout;
import java.awt.Toolkit;
import java.awt.datatransfer.Clipboard;
import java.awt.datatransfer.StringSelection;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.MouseAdapter;
import java.awt.event.MouseEvent;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;

import javax.swing.*;


public class SearchUI{
 
 private JButton file = new JButton("选择文件夹");
 private JLabel label = new JLabel(" 关键字:");
 private JTextField keyWord = new JTextField("",26);//在这里输入关键字,多个关键字用空格隔开
 private JCheckBox docFile = new JCheckBox(".doc");
 private JCheckBox javaFile = new JCheckBox(".java");
 private JCheckBox txtFile = new JCheckBox(".txt");
 private JPopupMenu   popup;
 private String jtabelFilePath= null;
 private final int JTabelColumnNum = 180;
 
 final Object[] columnNames = {"文件名", "文件路径", "文件大小","修改日期"};
 Object[][] rowData = new Object[JTabelColumnNum][4];
 private JTable FileList = new JTable (rowData,columnNames);
 private int choiceRow = -1;

 
 public void showUI(){
  
  popup = new JPopupMenu();  
  FileList.setPreferredScrollableViewportSize(new Dimension(600, 100));//设置表格的大小
  FileList.setRowHeight (16);//设置每行的高度
  FileList.setSelectionForeground (Color.red);//设置所选择行的前景色
  FileList.doLayout ();
  FileList.setComponentPopupMenu(popup);
  FileList.getColumnModel().getColumn(1).setPreferredWidth(250); //设置列宽

  JScrollPane pane = new JScrollPane (FileList);
  JPanel p = new JPanel (new FlowLayout (6));

  p.add(label);
  p.add(keyWord);
  //p.add(docFile);
  //p.add(txtFile);
  //p.add(javaFile);
  p.add(file);
  
  JSeparator seph = new JSeparator();
  
  JPanel panel = new JPanel (new BorderLayout());
  panel.setPreferredSize (new Dimension (750,450));
  panel.add(seph,BorderLayout.NORTH);
  panel.add (pane,BorderLayout.CENTER);

  JFrame frame = new JFrame ("文档搜索");
  frame.setSize(750,450);
  frame.setVisible(true);
  frame.setDefaultCloseOperation (JFrame.EXIT_ON_CLOSE);

  Container con = frame.getContentPane();
  con.setLayout(new BorderLayout());
  con.add(p,BorderLayout.NORTH);
  con.add(panel,BorderLayout.CENTER);

  frame.pack();
  
  
  file.addActionListener(new ActionListener(){

   public void actionPerformed(ActionEvent arg0) {
    // TODO 自动生成方法存根
    ClearTabel();
    String KeyWord = keyWord.getText();
    FileScan scanfile = new FileScan();
    scanfile.setKeyWord(KeyWord);
    
    JFileChooser chooser = new JFileChooser();
    chooser.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY);
    JFrame fr = new JFrame ();
    int returnVal = chooser.showOpenDialog(fr);
    if(returnVal == JFileChooser.APPROVE_OPTION) {
     
     FileSearch fs = new FileSearch();
     String strPath = chooser.getSelectedFile().toString();
     fs.setFileList(strPath);
     
     //JOptionPane.showMessageDialog(null,"fs.setFileList(strPath);。");
     
     int size = fs.getFilePath().size();
     int ff = 0;
     javaReadDoc jdocreader = new javaReadDoc();
     
     for(int i=0;i<size;i++){
      
      String docpath = fs.getFilePath().get(i);
      String Text= null;
      
      
      Text = jdocreader.readDoc(docpath);

      
      if(scanfile.scanFile(Text)){
       
       ShowFileList(docpath,ff);
       ff++;
      }           
     }
     //JOptionPane.showMessageDialog(null,"fs.setFileList;。");
     
     ArrayList<String> failOpenList = jdocreader.getFailOpenFile();
     int fofSize = failOpenList.size();
     if(fofSize>0){
      ff++;
      for(int num=0;num<fofSize;num++){
       ff++;
       ShowFileList(failOpenList.get(num),ff);
      }
      ff=ff-fofSize-1;
      JOptionPane.showMessageDialog(null,"搜索完毕!共搜到" +ff+" 个结果。/n其中列表最后"+ fofSize +"个文件未能扫描");
     }
     else JOptionPane.showMessageDialog(null,"搜索完毕!共搜到" +ff+" 个结果。");
    }   
   }  
  });
  
  FileList.addMouseListener(new MouseAdapter() {
   public void mouseClicked(MouseEvent me) {
    if (me.getClickCount()==1) {
     choiceRow = FileList.getSelectedRow();
     jtabelFilePath = (String) FileList.getValueAt(choiceRow, 1);     
    }
   }
  });
  
  Action openRoomAction = new AbstractAction()
  {
   public void actionPerformed(ActionEvent actionEvent)
   {
    if (jtabelFilePath.equals(null)){
    }
    else{
     
    }
   }
  };

  openRoomAction.putValue(Action.NAME, "所在文件夹");
  //popup.add(openRoomAction);
  
  Action openfileAction = new AbstractAction()
  {
   public void actionPerformed(ActionEvent actionEvent)
   {
    if (jtabelFilePath.equals("")){
    }
    else{
     try {
      Runtime.getRuntime().exec("C://Program Files//Microsoft Office//OFFICE11//WINWORD.EXE    "+jtabelFilePath);
      } catch (IOException e) {
      JOptionPane.showMessageDialog(null,"打开文件时出错!");
      //System.out.println("打开文件出错!");
      e.printStackTrace();
     }
    }
   }
  };
  
  openfileAction.putValue(Action.NAME, "打开文件");
  popup.add(openfileAction);
  
  Action copyFilePath = new AbstractAction()
  {
   public void actionPerformed(ActionEvent actionEvent)
   {
    if (jtabelFilePath.equals("")){
    }
    
    else{
     Clipboard clipboard=Toolkit.getDefaultToolkit().getSystemClipboard(); //获得系统剪贴板
     StringSelection contents=new StringSelection(jtabelFilePath); //用拷贝文本框文本实例化StringSelection对象
     clipboard.setContents(contents, null); //设置系统剪贴板内容
    }
   }
  };
  copyFilePath.putValue(Action.NAME, "复制路径");
  popup.add(copyFilePath);

 
  Action delfileAction = new AbstractAction()
  {
   public void actionPerformed(ActionEvent actionEvent)
   {
    if (jtabelFilePath.equals("")){
    }
    
    else{
     FileTools fileTools = new FileTools();
     if(fileTools.delFile(jtabelFilePath)){
      FileList.setValueAt(null, choiceRow, 0);
      FileList.setValueAt(null, choiceRow, 1);
      FileList.setValueAt(null, choiceRow, 2);
      FileList.setValueAt(null, choiceRow, 3);
     }
     
    }
   }
  };
  delfileAction.putValue(Action.NAME, "删除文件");
  popup.add(delfileAction);
 }


 
 public static void main(String args[]){
  new SearchUI().showUI();
 }
 
 public void ShowFileList(String strPath,int row){
  java.io.File myFile = new java.io.File(strPath);
  FileList.setValueAt(myFile.getName(), row, 0);
  FileList.setValueAt(myFile.getPath(), row, 1);
  FileList.setValueAt(myFile.length()+" B", row, 2);
  java.util.Date dt = new Date(myFile.lastModified());
  FileList.setValueAt(dt.toLocaleString(), row, 3);
 }
 
 public void ClearTabel(){
  for (int i=0;i<JTabelColumnNum;i++){
   FileList.setValueAt(null, i, 0);
   FileList.setValueAt(null, i, 1);
   FileList.setValueAt(null, i, 2);
   FileList.setValueAt(null, i, 3);
  }
 }
 
 public Boolean getdocCheckBox(){
  return docFile.isSelected();
 }
 
 public Boolean gettxtCheckBox(){
  return txtFile.isSelected();
 }
 
 public Boolean getjavaCheckBox(){
  return javaFile.isSelected();
 }
 
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值