对于用 Java 解析 Word 文档,主流方法是通过中间件去访问,但这需要用到 JNI。
Apache 和微软合作开发了 POI;POI 虽然功能强大,但用起来也很麻烦。
用 Java 解析 Word 时,可以抛弃使用中间件这种繁琐的方法,直接使用 tm-extractors 对 Word 文档进行解析和检索。
tm-extractors0.4版本不需要apache的poi包,因为里面已经整合了
tm-extractors1.0版本需要poi-3.0.1-FINAL-20070705.jar,版本一定要正确
tm-extractors官方网站
废话少说,直接上图和代码:
package docSearch;
import java.io.File;
import java.util.ArrayList;
/**
 * Recursively collects the absolute paths of Word ({@code .doc}) files below a directory.
 *
 * <p>Results accumulate in {@link #filelist}; every call to {@link #setFileList(String)}
 * appends to the same list.
 */
public class FileSearch {
    /** Absolute paths of all matching files found so far. */
    public ArrayList<String> filelist = new ArrayList<String>();

    /**
     * Walks {@code strPath} and all of its sub-directories, appending every accepted
     * {@code .doc} file to {@link #filelist}.
     *
     * <p>The extension check is case-insensitive. A file is rejected only when its name
     * starts with {@code "~$"} AND it is smaller than 500 bytes.
     *
     * @param strPath directory to scan; if it cannot be listed, the call does nothing
     */
    public void setFileList(String strPath) {
        File[] entries = new File(strPath).listFiles();
        if (entries == null) {
            return;
        }
        for (File entry : entries) {
            String absolutePath = entry.getAbsolutePath();
            if (entry.isDirectory()) {
                setFileList(absolutePath);
                continue;
            }
            if (!absolutePath.toLowerCase().endsWith(".doc")) {
                continue;
            }
            // Reject only entries that are both "~$"-prefixed and under 500 bytes.
            boolean smallLockFile = entry.getName().startsWith("~$") && entry.length() < 500;
            if (!smallLockFile) {
                filelist.add(absolutePath);
            }
        }
    }

    /**
     * Returns the list of collected paths.
     *
     * @return the same list instance that is stored in {@link #filelist}
     */
    public ArrayList<String> getFilePath() {
        return filelist;
    }
}
package docSearch;
import java.io.File;
import java.util.ArrayList;
import java.util.StringTokenizer;
/**
 * Holds a list of search keywords and tests whether a piece of text contains any of them.
 */
public class FileScan {
    /** Keywords to look for; each entry is matched as one literal phrase. */
    public ArrayList<String> keyWord = new ArrayList<String>();
    /** Paths of the files to scan; only stored here, not read by this class. */
    public ArrayList<String> FilePathList = new ArrayList<String>();

    /**
     * Replaces the stored list of file paths.
     *
     * @param filePathList the new list (stored by reference)
     */
    public void setFilePathList(ArrayList<String> filePathList){
        this.FilePathList = filePathList;
    }

    /**
     * Adds one keyword. The string is added as a whole; it is not split on whitespace.
     *
     * @param Keyword the phrase to search for
     */
    public void setKeyWord(String Keyword){
        keyWord.add(Keyword);
    }

    /**
     * Tells whether {@code fileText} contains at least one of the stored keywords.
     *
     * @param fileText the text to examine; {@code null} is treated as "no match"
     * @return {@code true} if any non-empty keyword occurs in the text, otherwise {@code false}
     */
    public Boolean scanFile(String fileText){
        for (String kw : keyWord) {
            if (FindSubString(fileText, kw) != -1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Placeholder accessor; no text is stored by this class.
     *
     * @return always {@code null}
     */
    public String getFileText(){
        return null;
    }

    /**
     * Finds the first occurrence of {@code keyword} inside {@code fileText}.
     *
     * @param fileText the text to search in
     * @param keyword  the literal phrase to look for
     * @return the zero-based index of the first occurrence, or {@code -1} when there is no
     *         occurrence or when either argument is {@code null} or empty
     */
    public int FindSubString(String fileText,String keyword){
        // An empty keyword must yield -1 here, whereas String.indexOf("") would return 0.
        if (fileText == null || keyword == null
                || fileText.length() == 0 || keyword.length() == 0) {
            return -1;
        }
        return fileText.indexOf(keyword);
    }
}
package docSearch;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import javax.swing.JOptionPane;
import org.textmining.extraction.word.WordTextExtractor;
import org.textmining.extraction.word.WordTextExtractorFactory;
/**
 * Reads the text content of a file: Word ({@code .doc}) files go through the tm-extractors
 * {@code WordTextExtractor}, every other file is read as plain text.
 *
 * <p>Files that cannot be read are recorded in {@link #failOpenList}.
 */
public class javaReadDoc {
    /** Paths of all files that could not be opened or read. */
    public ArrayList<String> failOpenList = new ArrayList<String>();

    /**
     * Returns the text of the file at {@code docpath}.
     *
     * <p>The {@code .doc} extension is matched case-insensitively. For plain-text files the
     * lines are concatenated without line separators and decoded with the platform default
     * charset.
     *
     * @param docpath path of the file to read
     * @return the extracted text (never {@code null}; empty for an empty file), or the
     *         placeholder string {@code "的"} when the file could not be read, in which case
     *         the path has also been added to {@link #failOpenList}
     */
    public String readDoc(String docpath){
        FileInputStream in;
        try {
            in = new FileInputStream(new File(docpath));
        } catch (FileNotFoundException e) {
            System.out.println("新建"+docpath+"文件数据流失败!");
            e.printStackTrace();
            // Stop here instead of handing a null stream to the extractor.
            return markUnreadable(docpath);
        }
        try {
            String text = isWordDocument(docpath) ? extractWordText(in) : readPlainText(in);
            return text == null ? "" : text;
        } catch (IOException e) {
            System.out.println("读入"+docpath+"文件失败!");
            e.printStackTrace();
            return markUnreadable(docpath);
        } catch (Exception e) {
            // Any other extractor failure: skip the file but remember it.
            return markUnreadable(docpath);
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Returns the live list of files that could not be read.
     *
     * @return the same list instance that is stored in {@link #failOpenList}
     */
    public ArrayList<String> getFailOpenFile(){
        return this.failOpenList;
    }

    /** Tells whether the path ends with ".doc", ignoring case. */
    private boolean isWordDocument(String docpath) {
        return docpath.regionMatches(true, docpath.length() - 4, ".doc", 0, 4);
    }

    /** Extracts the text of a Word document from the given stream. */
    private String extractWordText(FileInputStream in) throws Exception {
        WordTextExtractorFactory extractor = new WordTextExtractorFactory();
        WordTextExtractor ex = (WordTextExtractor) extractor.textExtractor(in);
        return ex.getText();
    }

    /** Reads all lines from the stream and joins them without separators. */
    private String readPlainText(FileInputStream in) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(in));
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = br.readLine()) != null) {
            text.append(line);
        }
        return text.toString();
    }

    /** Records the path as unreadable and returns the placeholder text. */
    private String markUnreadable(String docpath) {
        failOpenList.add(docpath);
        return "的";
    }

    /** Closes the stream, ignoring a failure to close. */
    private void closeQuietly(FileInputStream in) {
        try {
            in.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if closing fails; the text was already read.
        }
    }
}
package docSearch;
import java.awt.BorderLayout;
import java.awt.Color;
import java.awt.Container;
import java.awt.Dimension;
import java.awt.FlowLayout;
import java.awt.Toolkit;
import java.awt.datatransfer.Clipboard;
import java.awt.datatransfer.StringSelection;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.MouseAdapter;
import java.awt.event.MouseEvent;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import javax.swing.*;
/**
 * Swing window of the document search tool.
 *
 * <p>The user types a keyword, picks a folder, and the table lists every file found by
 * {@code FileSearch} whose text (as read by {@code javaReadDoc}) contains the keyword
 * according to {@code FileScan}. Files that could not be read are listed after the matches.
 * A popup menu on the table offers "open", "copy path" and "delete" for the row that was
 * last clicked.
 */
public class SearchUI{
    /** Program started by the "open file" popup action. */
    private static final String WORD_EXECUTABLE =
            "C://Program Files//Microsoft Office//OFFICE11//WINWORD.EXE";
    /** Model index of the table column that holds the file path. */
    private static final int PATH_COLUMN = 1;

    private JButton file = new JButton("选择文件夹");
    private JLabel label = new JLabel(" 关键字:");
    // Keyword input field.
    private JTextField keyWord = new JTextField("",26);
    private JCheckBox docFile = new JCheckBox(".doc");
    private JCheckBox javaFile = new JCheckBox(".java");
    private JCheckBox txtFile = new JCheckBox(".txt");
    private JPopupMenu popup;
    /** Path shown in the row that was last clicked, or null when there is none. */
    private String jtabelFilePath= null;
    /** Number of (empty) rows the table starts with. */
    private final int JTabelColumnNum = 180;
    final Object[] columnNames = {"文件名", "文件路径", "文件大小","修改日期"};
    /** Only seeds the initial empty rows of the table model. */
    Object[][] rowData = new Object[JTabelColumnNum][4];
    /** Growable model, so a result set larger than the initial row count still fits. */
    private final javax.swing.table.DefaultTableModel tableModel =
            new javax.swing.table.DefaultTableModel(rowData, columnNames);
    private JTable FileList = new JTable (tableModel);
    /** Row that was last clicked, or -1 when there is none. */
    private int choiceRow = -1;

    /**
     * Builds the window, wires the search button, the table mouse listener and the popup
     * menu actions, and shows the frame.
     */
    public void showUI(){
        popup = new JPopupMenu();
        FileList.setPreferredScrollableViewportSize(new Dimension(600, 100)); // table size
        FileList.setRowHeight (16);                                           // row height
        FileList.setSelectionForeground (Color.red);       // foreground of the selected row
        FileList.doLayout ();
        FileList.setComponentPopupMenu(popup);
        FileList.getColumnModel().getColumn(PATH_COLUMN).setPreferredWidth(250); // column width
        JScrollPane pane = new JScrollPane (FileList);

        // NOTE(review): 6 is not one of the FlowLayout alignment constants; kept as-is
        // to avoid changing the layout — confirm the intended alignment.
        JPanel p = new JPanel (new FlowLayout (6));
        p.add(label);
        p.add(keyWord);
        p.add(file);

        JSeparator seph = new JSeparator();
        JPanel panel = new JPanel (new BorderLayout());
        panel.setPreferredSize (new Dimension (750,450));
        panel.add(seph,BorderLayout.NORTH);
        panel.add (pane,BorderLayout.CENTER);

        final JFrame frame = new JFrame ("文档搜索");
        frame.setDefaultCloseOperation (JFrame.EXIT_ON_CLOSE);
        Container con = frame.getContentPane();
        con.setLayout(new BorderLayout());
        con.add(p,BorderLayout.NORTH);
        con.add(panel,BorderLayout.CENTER);
        frame.pack();
        // Show the frame only once it is fully populated and sized.
        frame.setVisible(true);

        file.addActionListener(new ActionListener(){
            public void actionPerformed(ActionEvent event) {
                runSearch(frame);
            }
        });

        FileList.addMouseListener(new MouseAdapter() {
            public void mouseClicked(MouseEvent me) {
                if (me.getClickCount()==1) {
                    rememberSelection();
                }
            }
        });

        Action openfileAction = new AbstractAction()
        {
            public void actionPerformed(ActionEvent actionEvent)
            {
                if (!hasSelectedPath()) {
                    return;
                }
                try {
                    // Array form: neither the program path nor the document path is split
                    // on spaces, which the single-string form of exec would do.
                    Runtime.getRuntime().exec(new String[] {WORD_EXECUTABLE, jtabelFilePath});
                } catch (IOException e) {
                    JOptionPane.showMessageDialog(null,"打开文件时出错!");
                    e.printStackTrace();
                }
            }
        };
        openfileAction.putValue(Action.NAME, "打开文件");
        popup.add(openfileAction);

        Action copyFilePath = new AbstractAction()
        {
            public void actionPerformed(ActionEvent actionEvent)
            {
                if (!hasSelectedPath()) {
                    return;
                }
                // Put the selected path on the system clipboard.
                Clipboard clipboard = Toolkit.getDefaultToolkit().getSystemClipboard();
                StringSelection contents = new StringSelection(jtabelFilePath);
                clipboard.setContents(contents, null);
            }
        };
        copyFilePath.putValue(Action.NAME, "复制路径");
        popup.add(copyFilePath);

        Action delfileAction = new AbstractAction()
        {
            public void actionPerformed(ActionEvent actionEvent)
            {
                if (!hasSelectedPath()) {
                    return;
                }
                FileTools fileTools = new FileTools();
                if (fileTools.delFile(jtabelFilePath)) {
                    for (int column = 0; column < 4; column++) {
                        tableModel.setValueAt(null, choiceRow, column);
                    }
                    // The row is empty now; do not keep pointing at the deleted file.
                    jtabelFilePath = null;
                }
            }
        };
        delfileAction.putValue(Action.NAME, "删除文件");
        popup.add(delfileAction);
    }

    /**
     * Starts the application.
     *
     * @param args ignored
     */
    public static void main(String args[]){
        // Swing components must be created and shown on the event dispatch thread.
        SwingUtilities.invokeLater(new Runnable() {
            public void run() {
                new SearchUI().showUI();
            }
        });
    }

    /**
     * Writes name, path, size and modification date of a file into one table row,
     * growing the table when {@code row} lies beyond its current last row.
     *
     * @param strPath path of the file to display
     * @param row     zero-based table row to fill
     */
    public void ShowFileList(String strPath,int row){
        if (row >= tableModel.getRowCount()) {
            tableModel.setRowCount(row + 1);
        }
        java.io.File myFile = new java.io.File(strPath);
        // Write through the model so the cells stay correct even if the user reorders
        // the table columns.
        tableModel.setValueAt(myFile.getName(), row, 0);
        tableModel.setValueAt(myFile.getPath(), row, PATH_COLUMN);
        tableModel.setValueAt(myFile.length()+" B", row, 2);
        java.util.Date dt = new Date(myFile.lastModified());
        // Same formatting as the deprecated Date.toLocaleString().
        tableModel.setValueAt(java.text.DateFormat.getDateTimeInstance().format(dt), row, 3);
    }

    /**
     * Empties the table, restores its initial number of blank rows and forgets the
     * previously clicked row.
     */
    public void ClearTabel(){
        tableModel.setRowCount(0);
        tableModel.setRowCount(JTabelColumnNum);
        // A path remembered from the previous result list must not survive a new search.
        choiceRow = -1;
        jtabelFilePath = null;
    }

    /** @return whether the ".doc" check box is selected */
    public Boolean getdocCheckBox(){
        return docFile.isSelected();
    }

    /** @return whether the ".txt" check box is selected */
    public Boolean gettxtCheckBox(){
        return txtFile.isSelected();
    }

    /** @return whether the ".java" check box is selected */
    public Boolean getjavaCheckBox(){
        return javaFile.isSelected();
    }

    /** Asks for a folder, scans its files for the keyword and fills the table. */
    private void runSearch(JFrame parent){
        ClearTabel();
        FileScan scanfile = new FileScan();
        scanfile.setKeyWord(keyWord.getText());

        JFileChooser chooser = new JFileChooser();
        chooser.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY);
        if (chooser.showOpenDialog(parent) != JFileChooser.APPROVE_OPTION) {
            return;
        }

        FileSearch fs = new FileSearch();
        fs.setFileList(chooser.getSelectedFile().toString());

        javaReadDoc jdocreader = new javaReadDoc();
        ArrayList<String> unreadable = new ArrayList<String>();
        int matchCount = 0;
        for (String docpath : fs.getFilePath()) {
            int failuresBefore = jdocreader.getFailOpenFile().size();
            String text = jdocreader.readDoc(docpath);
            boolean failed = jdocreader.getFailOpenFile().size() > failuresBefore;
            if (failed || text == null) {
                // Never match against the placeholder text returned for unreadable files.
                unreadable.add(docpath);
                continue;
            }
            if (scanfile.scanFile(text)) {
                ShowFileList(docpath, matchCount);
                matchCount++;
            }
        }

        int failedCount = unreadable.size();
        if (failedCount > 0) {
            // Unreadable files go below the matches, after two blank rows.
            int row = matchCount + 2;
            for (String path : unreadable) {
                ShowFileList(path, row);
                row++;
            }
            JOptionPane.showMessageDialog(parent,"搜索完毕!共搜到" +matchCount+" 个结果。\n其中列表最后"+ failedCount +"个文件未能扫描");
        }
        else {
            JOptionPane.showMessageDialog(parent,"搜索完毕!共搜到" +matchCount+" 个结果。");
        }
    }

    /** Stores the selected row and its path, or clears both when nothing usable is selected. */
    private void rememberSelection(){
        choiceRow = FileList.getSelectedRow();
        if (choiceRow < 0) {
            jtabelFilePath = null;
            return;
        }
        Object value = tableModel.getValueAt(choiceRow, PATH_COLUMN);
        jtabelFilePath = (value == null) ? null : value.toString();
    }

    /** Tells whether a non-empty path was remembered from the last click. */
    private boolean hasSelectedPath(){
        return jtabelFilePath != null && !jtabelFilePath.equals("");
    }
}