Lucene.Net实现pdf,doc,xls,ppt,htm,html等格式文件的检索

代码如下,代码没有优化,仅实现功能
该代码复制到程序中不能直接使用,需要下载文章最后的例子,取得其中的dll后才可以

using  System;
using  System.Configuration;
using  System.Data;
using  System.Linq;
using  System.Web;
using  System.Web.Security;
using  System.Web.UI;
using  System.Web.UI.HtmlControls;
using  System.Web.UI.WebControls;
using  System.Web.UI.WebControls.WebParts;
using  System.Xml.Linq;
using  System.Text;
using  System.IO;

using  Lucene.Net.Documents;
using  Lucene.Net.Index;
using  Lucene.Net.Search;
using  Lucene.Net.QueryParsers;
using  Lucene.Net.Analysis.Standard;

using  Lucene.Net.Analysis.Cn;


using  org.pdfbox.pdmodel;
using  org.pdfbox.util;

using  System.Text.RegularExpressions;

public   partial   class  _Default : System.Web.UI.Page
{
    
// Time at which index building started; Button2_Click sets it and uses it to report the elapsed time.
public  DateTime start  =   new  DateTime();
    
// Delegate type with the same parameter list as IndexDirectory.
// NOTE(review): not referenced in the visible part of this partial class.
delegate   void  AsyncIndexDirectoryCaller(IndexWriter writer, FileInfo file);
    // Searcher over the stored index; Button1_Click creates it for a search and closes it afterwards.
    IndexSearcher searcher 
=   null ;

    
/// <summary>
    /// Page load handler: on the first (non-postback) request, pre-fills TextBox3
    /// with the physical path that the " doc " virtual path maps to.
    /// </summary>
    /// <param name="sender">Event source (not used).</param>
    /// <param name="e">Event data (not used).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        // Nothing to do on postbacks; the default is only written once.
        if (IsPostBack)
        {
            return;
        }

        string defaultSourceDirectory = Server.MapPath(" doc ");
        TextBox3.Text = defaultSourceDirectory;
    }


    
#region  建立索引
    
/// <summary>
    /// Click handler that (re)builds the index: walks the path typed into TextBox3,
    /// indexes every supported file into the index directory, optimizes the index and
    /// reports the elapsed time in TextBox1. Any failure message is written to TextBox4.
    /// </summary>
    /// <param name="sender">Event source (not used).</param>
    /// <param name="e">Event data (not used).</param>
    protected void Button2_Click(object sender, EventArgs e)
    {
        // Directory that stores the index. The literal is kept byte-for-byte because
        // the search handler (Button1_Click) maps the very same literal to find the index.
        string indexStorePath = Server.MapPath(" index ");
        // Directory (or single file) whose contents are to be indexed.
        string sourcePath = TextBox3.Text;

        IndexWriter writer = null;
        try
        {
            writer = new IndexWriter(indexStorePath, new ChineseAnalyzer(), true);
            start = DateTime.Now;

            IndexDirectory(writer, new FileInfo(sourcePath));
            writer.Optimize();
            writer.Close();
            writer = null; // closed cleanly; nothing left for the finally block to release

            TimeSpan elapsed = DateTime.Now - start;
            TextBox1.Text = " 提示:索引完成,共用时  " + elapsed.TotalSeconds + "  秒\n ";
        }
        catch (Exception ex)
        {
            TextBox4.Text = ex.Message;
        }
        finally
        {
            // If indexing failed half-way the writer is still open; close it so the
            // index directory is not left locked for the next attempt.
            if (writer != null)
            {
                try
                {
                    writer.Close();
                }
                catch (Exception closeEx)
                {
                    TextBox4.Text = TextBox4.Text + closeEx.Message;
                }
            }
        }
    }

    
/// <summary>
    /// Indexes <paramref name="file"/>: when it names a directory, every entry below it is
    /// visited recursively; when it names a file with a supported extension, the file is
    /// handed to IndexFile. Anything else is ignored.
    /// </summary>
    /// <param name="writer">Open index writer that receives the documents.</param>
    /// <param name="file">File or directory to index.</param>
    public void IndexDirectory(IndexWriter writer, FileInfo file)
    {
        if (Directory.Exists(file.FullName))
        {
            // GetFileSystemEntries returns files and sub-directories alike,
            // so the recursive call below also descends into nested folders.
            foreach (string entry in Directory.GetFileSystemEntries(file.FullName))
            {
                IndexDirectory(writer, new FileInfo(entry));
            }
            return;
        }

        if (IsIndexableExtension(file.Extension))
        {
            IndexFile(file, writer);
        }
    }

    // File extensions (with leading dot, no surrounding whitespace) that get indexed.
    private static readonly string[] IndexableExtensions =
    {
        ".txt", ".htm", ".html", ".pdf", ".doc", ".rtf", ".ppt", ".xls"
    };

    // Returns true when the extension is one of IndexableExtensions, ignoring case.
    private static bool IsIndexableExtension(string extension)
    {
        foreach (string candidate in IndexableExtensions)
        {
            if (string.Equals(candidate, extension, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }
        return false;
    }

    
/// <summary>
    /// Extracts the text of a single file according to its extension and adds it to the index.
    /// </summary>
    /// <remarks>
    /// .pdf is read with PDFBox, .doc/.rtf through Word automation, .ppt through PowerPoint
    /// automation, .xls through Excel automation, and .htm/.html are tag-stripped with NoHTML.
    /// Every other extension is indexed as a plain text file.
    /// A FileNotFoundException is appended to TextBox4 and the file is skipped; any other
    /// exception propagates to the caller.
    /// </remarks>
    /// <param name="file">File to index.</param>
    /// <param name="writer">Open index writer that receives the document.</param>
    private void IndexFile(FileInfo file, IndexWriter writer)
    {
        try
        {
            string text;
            switch (file.Extension.ToLower())
            {
                case ".pdf":
                    text = ExtractPdfText(file.FullName);
                    break;

                case ".doc":
                case ".rtf": // Word is able to open RTF files too
                    text = ExtractWordText(file.FullName);
                    break;

                case ".ppt":
                    text = ExtractPowerPointText(file.FullName);
                    break;

                case ".xls":
                    text = ExtractExcelText(file.FullName);
                    break;

                case ".htm":
                case ".html":
                    // Index the tag-stripped text. The file is decoded with Encoding.Default,
                    // the same encoding the plain-text branch below uses.
                    text = NoHTML(File.ReadAllText(file.FullName, System.Text.Encoding.Default));
                    break;

                default: // everything else is treated as a plain text file
                    AddTextFileDocument(writer, file.FullName);
                    return;
            }

            AddTextDocument(writer, file.FullName, text);
        }
        catch (FileNotFoundException fnfe)
        {
            TextBox4.Text = TextBox4.Text + fnfe.Message + " \n ";
            return;
        }
    }

    // Adds a document whose contents come from an already extracted string.
    // The field-name literals are kept byte-for-byte: the search side reads the same names.
    private static void AddTextDocument(IndexWriter writer, string path, string text)
    {
        Document doc = new Document();
        doc.Add(new Field(" filename ", path, Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.Add(new Field(" contents ", text ?? string.Empty, Field.Store.NO, Field.Index.TOKENIZED));
        writer.AddDocument(doc);
    }

    // Adds a document whose contents are streamed from a text file on disk.
    private static void AddTextFileDocument(IndexWriter writer, string path)
    {
        // 'using' releases the file handle even when AddDocument throws.
        using (StreamReader reader = new StreamReader(path, System.Text.Encoding.Default))
        {
            Document doc = new Document();
            doc.Add(new Field(" filename ", path, Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.Add(new Field(" contents ", reader));
            writer.AddDocument(doc);
        }
    }

    // Returns the text of a PDF file, closing the PDF document afterwards.
    private static string ExtractPdfText(string path)
    {
        PDDocument pdf = PDDocument.load(path);
        try
        {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(pdf);
        }
        finally
        {
            pdf.close();
        }
    }

    // Returns the whole-story text of a Word-readable file (.doc, .rtf) via Word automation.
    private static string ExtractWordText(string path)
    {
        object filePath = path;
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Word.ApplicationClass wordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
        Microsoft.Office.Interop.Word.Document wordDoc = null;
        try
        {
            wordDoc = wordApp.Documents.Open(
                ref filePath, ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing);
            wordDoc.ActiveWindow.Selection.WholeStory();
            return wordDoc.ActiveWindow.Selection.Text;
        }
        finally
        {
            // Always shut Word down, otherwise a failed file leaves the application running.
            try
            {
                if (wordDoc != null)
                {
                    wordDoc.Close(ref missing, ref missing, ref missing);
                }
            }
            finally
            {
                wordApp.Quit(ref missing, ref missing, ref missing);
            }
        }
    }

    // Returns the concatenated text of all shapes on all slides of a presentation.
    private static string ExtractPowerPointText(string path)
    {
        StringBuilder text = new StringBuilder();

        PowerPoint.ApplicationClass pptApp = new PowerPoint.ApplicationClass();
        PowerPoint.Presentation presentation = null;
        try
        {
            presentation = pptApp.Presentations.Open(path,
                Microsoft.Office.Core.MsoTriState.msoTrue,
                Microsoft.Office.Core.MsoTriState.msoFalse,
                Microsoft.Office.Core.MsoTriState.msoFalse);

            foreach (PowerPoint.Slide slide in presentation.Slides)
            {
                foreach (PowerPoint.Shape shape in slide.Shapes)
                {
                    try
                    {
                        text.Append(shape.TextFrame.TextRange.Text);
                    }
                    catch
                    {
                        // Deliberate best effort: a shape whose text cannot be read
                        // (presumably one without a text frame) is skipped.
                    }
                }
            }
        }
        finally
        {
            // Always shut PowerPoint down, even when the file could not be read.
            try
            {
                if (presentation != null)
                {
                    presentation.Close();
                }
            }
            finally
            {
                pptApp.Quit();
            }
        }

        return text.ToString();
    }

    // Returns the concatenated cell values of every worksheet in a workbook.
    private static string ExtractExcelText(string path)
    {
        StringBuilder text = new StringBuilder();
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Excel.Application xApp = new Microsoft.Office.Interop.Excel.ApplicationClass();
        Microsoft.Office.Interop.Excel.Workbook xBook = null;
        try
        {
            xBook = xApp.Workbooks._Open(path,
                missing, missing, missing, missing, missing, missing,
                missing, missing, missing, missing, missing, missing);

            int sheetCount = xBook.Sheets.Count;
            for (int sheetIndex = 1; sheetIndex <= sheetCount; sheetIndex++) // Sheets is 1-based
            {
                Microsoft.Office.Interop.Excel.Worksheet sheet =
                    (Microsoft.Office.Interop.Excel.Worksheet)xBook.Sheets[sheetIndex];

                int rowCount = sheet.UsedRange.Rows.Count;
                int columnCount = sheet.UsedRange.Columns.Count;

                // NOTE(review): cells are addressed from A1 while the counts come from
                // UsedRange; this assumes the used range starts at A1 - confirm.
                for (int row = 1; row <= rowCount; row++)
                {
                    for (int column = 1; column <= columnCount; column++)
                    {
                        text.Append(((Microsoft.Office.Interop.Excel.Range)sheet.Cells[row, column]).Value2);
                    }
                }
            }
        }
        finally
        {
            // Always shut Excel down, even when the workbook could not be read.
            try
            {
                if (xBook != null)
                {
                    xBook.Close(missing, missing, missing);
                }
            }
            finally
            {
                xApp.Quit();
            }
        }

        return text.ToString();
    }

    
/// <summary>
    /// Filters HTML markup out of a string: removes script blocks, tags and comment
    /// remnants, decodes a handful of common entities, drops any remaining angle
    /// brackets and CR/LF pairs, and returns the result HTML-encoded and trimmed.
    /// </summary>
    /// <param name="Htmlstring">HTML source text; null or empty yields an empty string.</param>
    /// <returns>The HTML-encoded plain text.</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Remove scripts. Singleline lets '.' match the line breaks inside a script body,
        // so multi-line scripts are removed together with their content.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Remove HTML tags, collapse line breaks, and decode entities, in order.
        foreach (string[] rule in NoHtmlRules)
        {
            Htmlstring = Regex.Replace(Htmlstring, rule[0], rule[1], RegexOptions.IgnoreCase);
        }

        // string.Replace returns a new string, so the result has to be assigned back.
        Htmlstring = Htmlstring.Replace("<", "").Replace(">", "").Replace("\r\n", "");

        // HttpUtility works without an active HttpContext, so the method is usable
        // outside a request as well.
        return HttpUtility.HtmlEncode(Htmlstring).Trim();
    }

    // { pattern, replacement } pairs applied in order by NoHTML.
    private static readonly string[][] NoHtmlRules =
    {
        new string[] { @"<(.[^>]*)>", "" },
        new string[] { @"([\r\n])[\s]+", "" },
        new string[] { @"-->", "" },
        new string[] { @"<!--.*", "" },
        new string[] { @"&(quot|#34);", "\"" },
        new string[] { @"&(amp|#38);", "&" },
        new string[] { @"&(lt|#60);", "<" },
        new string[] { @"&(gt|#62);", ">" },
        new string[] { @"&(nbsp|#160);", " " },
        new string[] { @"&(iexcl|#161);", "\u00A1" },
        new string[] { @"&(cent|#162);", "\u00A2" },
        new string[] { @"&(pound|#163);", "\u00A3" },
        new string[] { @"&(copy|#169);", "\u00A9" },
        new string[] { @"&#(\d+);", "" }
    };
    
#endregion

    
#region  搜索
    
/// <summary>
    /// Click handler that runs a search: parses the keyword typed into TextBox2 against the
    /// contents field of the stored index and prints the hits through printResult.
    /// Any failure message is appended to TextBox4.
    /// </summary>
    /// <param name="sender">Event source (not used).</param>
    /// <param name="e">Event data (not used).</param>
    protected void Button1_Click(object sender, EventArgs e)
    {
        // Directory that stores the index. The literal is kept byte-for-byte because
        // the index builder (Button2_Click) maps the very same literal.
        string indexStorePath = Server.MapPath(" index ");
        string keyword = TextBox2.Text;

        try
        {
            searcher = new IndexSearcher(indexStorePath);

            // The field name is kept byte-for-byte: it must equal the name used when indexing.
            QueryParser parser = new QueryParser(" contents ", new ChineseAnalyzer());
            Query query = parser.Parse(keyword);

            Hits hits = searcher.Search(query);
            printResult(hits);
        }
        catch (Exception ex)
        {
            TextBox4.Text = TextBox4.Text + ex.Message;
        }
        finally
        {
            // Close the searcher on every path, including a failed parse or search,
            // so the index files are not kept open.
            if (searcher != null)
            {
                try
                {
                    searcher.Close();
                }
                catch (Exception closeEx)
                {
                    TextBox4.Text = TextBox4.Text + closeEx.Message;
                }
                searcher = null;
            }
        }
    }

    
/// <summary>
    /// Writes the search hits to TextBox1, one line per hit with its stored file path,
    /// or a "nothing found" message when there are no hits. A hit that cannot be read
    /// is skipped and its error message is appended to TextBox4.
    /// </summary>
    /// <param name="h">Hits returned by the searcher.</param>
    void printResult(Hits h)
    {
        StringBuilder output = new StringBuilder();
        int hitCount = h.Length(); // evaluated once instead of on every loop iteration

        if (hitCount == 0)
        {
            output.Append(" 对不起,没有搜索到你要的结果。\n ");
        }
        else
        {
            for (int i = 0; i < hitCount; i++)
            {
                try
                {
                    Document doc = h.Doc(i);
                    // Build the whole line first so a failing hit contributes nothing.
                    // The field name is kept byte-for-byte: it must equal the indexed name.
                    string line = " 这是第 " + (i + 1) + " 个搜索结果,文件路径为:  " + doc.Get(" filename ") + " \n ";
                    output.Append(line);
                }
                catch (Exception ex)
                {
                    TextBox4.Text = TextBox4.Text + ex.Message;
                }
            }
        }

        output.Append(" ---------------------------\n ");
        TextBox1.Text = output.ToString();
    }

    
#endregion

}


完整demo下载,点击下载

 

【reprinted from http://www.cnblogs.com/weekzero/archive/2008/06/11/1217521.html】

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值