lucene-内存索引、内存索引保存在硬盘、索引优化

索引代码

package bindex;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.LinkBean;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.RegexFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class perfieldindextest {

 /**
  * @param args
  */
 public static void main(String[] args) {
  // TODO Auto-generated method stub
  String indexpath="./indexes";

  IndexWriter writer;
  PerFieldAnalyzerWrapper wr;
  Document doc;
            try {
             RAMDirectory rd=new RAMDirectory();
             writer=new IndexWriter(rd,new StandardAnalyzer());
       wr=new PerFieldAnalyzerWrapper(new StandardAnalyzer());
       wr.addAnalyzer("title",new MMAnalyzer());
       wr.addAnalyzer("content", new MMAnalyzer());
       wr.addAnalyzer("author", new MMAnalyzer());       
       wr.addAnalyzer("time", new StandardAnalyzer());             
             //提取腾迅国内新闻链接
          LinkBean lb=new LinkBean();
          List baseurls=new ArrayList();
          baseurls.add("http://news.qq.com/china_index.shtml");
          baseurls.add("http://news.qq.com/world_index.shtml");
          baseurls.add("http://news.qq.com/society_index.shtml");
          for (int j=0;j<baseurls.size();j++){
              lb.setURL((String)baseurls.get(j));           
           URL[] urls=lb.getLinks();
          for (int i=0;i<urls.length;i++){
           doc=new Document();
                    String title="";
                    String content="";
                    String time="";
                    String author="";           
           System.out.println("正在提取"+(String)baseurls.get(j)+"第"+i+"个链接("+(int)(100*(i+1)/urls.length)+"%)["+urls[i].toString()+"].....");
           if  (!(urls[i].toString().startsWith("http://news.qq.com/a/"))){
            System.out.println("非新闻链接,忽略......");continue;
           }
           System.out.println("新闻链接,正在处理");           
           Parser parser=new Parser(urls[i].toString());
           parser.setEncoding("GBK");
        String url=urls[i].toString();
           NodeFilter filter_title=new TagNameFilter("title");   
        NodeList nodelist=parser.parse(filter_title);
        Node node_title=nodelist.elementAt(0);
           title=node_title.toPlainTextString();
        System.out.println("标题:"+title);
        parser.reset();
        NodeFilter filter_auth=new OrFilter(new HasAttributeFilter("class","auth"),new HasAttributeFilter("class","where")); 
        nodelist=parser.parse(filter_auth);
        Node node_auth=nodelist.elementAt(0);    
        if (node_auth != null) author=node_auth.toPlainTextString();
        else author="腾讯网";
        node_auth=nodelist.elementAt(1);
        if (node_auth != null) author+=node_auth.toPlainTextString();
        System.out.println("作者:"+author);   
        parser.reset();
        NodeFilter filter_time=new OrFilter(new HasAttributeFilter("class","info"),new RegexFilter("[0-9]{4}年[0-9]{1,2}月[0-9]{1,2}日[' ']*[0-9]{1,2}:[0-9]{1,2}")); 
        nodelist=parser.parse(filter_time);
        Node node_time=nodelist.elementAt(0);    
        if (node_time!=null) {
         if (node_time.getChildren()!=null) node_time=node_time.getFirstChild();
            time=node_time.toPlainTextString().replaceAll("[ |\t|\n|\f|\r\ ]","").substring(0,16);
        }        

        System.out.println("时间:"+time);   
        parser.reset();
        NodeFilter filter_content=new OrFilter(new OrFilter(new HasAttributeFilter("style","TEXT-INDENT: 2em"),new HasAttributeFilter("id","Cnt-Main-Article-QQ")),new HasAttributeFilter("id","ArticleCnt")); 
        nodelist=parser.parse(filter_content);
        Node node_content=nodelist.elementAt(0);  
           if (node_content!=null){           
             content=node_content.toPlainTextString().replaceAll("(#.*)|([a-z].*;)|}","").replaceAll(" |\t|\r|\n|\ ","");
        }
        System.out.println("内容:"+content);
     System.out.println("正在索引.....");            
        Field field=new Field("title",title,Field.Store.YES,Field.Index.TOKENIZED);
        doc.add(field);
        field=new Field("content",content,Field.Store.YES,Field.Index.TOKENIZED);
        doc.add(field);
        field=new Field("author",author,Field.Store.YES,Field.Index.UN_TOKENIZED);
        doc.add(field);
        field=new Field("time",time,Field.Store.YES,Field.Index.NO);
        doc.add(field);   
        field=new Field("url",url,Field.Store.YES,Field.Index.NO);
        doc.add(field); 
     writer.addDocument(doc,new MMAnalyzer());
     System.out.println("<"+title+"索引成功>");
          }
          }
                writer.close();
       wr.close();  

//内存索引写入硬盘文件
       FSDirectory fd=FSDirectory.getDirectory(indexpath);
       IndexWriter wi=new IndexWriter(fd,new MMAnalyzer());
       wi.addIndexes(new Directory[]{rd});
       //内存中文档最大值是80
       wi.setMaxMergeDocs(80);
       //内存中存储80个文档时写成磁盘一个块
       wi.setMergeFactor(80);
       wi.close();

    System.out.println("<索引建立完毕>");       
   } catch (ParserException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
   } catch (CorruptIndexException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
   } catch (IOException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
   }
 }

}

servlet代码:

package bservlet;

import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

 

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;

import java.io.*;

import jeasy.analysis.MMAnalyzer;


public class SluceneSearcher extends HttpServlet {
 private String indexpath="D:/workspace/testsearch2/indexes";
 public void doPost(HttpServletRequest request,HttpServletResponse response){
  StringBuffer sb=new StringBuffer("");
  try {  
       request.setCharacterEncoding("GBK"); 
       String phrase=request.getParameter("phrase"); 
    Analyzer analyzer=new MMAnalyzer();
       IndexSearcher searcher;
    searcher = new IndexSearcher(indexpath);
    QueryParser parser=new QueryParser("content",analyzer);
    Query q= parser.parse(phrase);
    Hits hs=searcher.search(q);
    int num=hs.length();
    sb.append("<h1>您搜索到的记录数:"+num+"</h1>");
    for (int i=0;i<num;i++){
     Document doc=hs.doc(i);
     if (doc==null){
      continue;
     }     
     Field field_title=doc.getField("title");
     String title="<br><a href="+doc.getField("url").stringValue()+" target='_blank'>"+field_title.stringValue()+"</a><br>";
     Field field_author=doc.getField("author");
     String author="<br>author:<br>"+field_author.stringValue();
     Field field_time=doc.getField("time");
     String time="<br>time:<br>"+field_time.stringValue();
     sb.append(title);
     sb.append(author);
     sb.append(time);     
    }
    searcher.close();
   } catch (CorruptIndexException e1) {
    // TODO Auto-generated catch block
    e1.printStackTrace();
   } catch (IOException e1) {
    // TODO Auto-generated catch block
    e1.printStackTrace();
   } catch (ParseException e) {
     // TODO Auto-generated catch block
       e.printStackTrace();
      }   
  PrintWriter out;
  try {
   response.setContentType("text/html;charset=GBK");
   out = response.getWriter();
   out.print(sb.toString());
   out.close();
  } catch (IOException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  } 

 }
 public void doGet(HttpServletRequest request,HttpServletResponse response){
  doPost(request,response);
 }

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值