需要实现一个根据文章标题查找相关新闻的功能。本来想简单处理:先对标题做分词,去除停用词,再做一个布尔查询。朋友建议使用 Lucene 自带的 MoreLikeThisQuery,试了一下,觉得效果还可以,在此贴上示例代码(MoreLikeThisQuery 位于 contrib 下的 queries 模块):
pom文件:
<!-- Maven build for the Lucene MoreLikeThisQuery sample: Lucene core plus the contrib "queries" module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>lucene-test</groupId>
  <artifactId>lucene-test</artifactId>
  <version>0.1-SNAPSHOT</version>
  <name>lucene-test</name>

  <dependencies>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>2.9.2</version>
      <!-- author's note: 3.0.0 -->
    </dependency>
    <dependency>
      <!-- contrib module that provides MoreLikeThisQuery -->
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queries</artifactId>
      <version>2.9.2</version>
      <!-- author's note: 3.0.0 lucene-queries-2.9.2-dev.jar -->
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <!-- compile as Java 1.6; sources are UTF-8 (they contain Chinese string literals) -->
          <source>1.6</source>
          <target>1.6</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
Java文件:
package lucene.test;
import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similar.MoreLikeThisQuery;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;
//ref doc:
//http://www.iteye.com/topic/586043
//http://www.cnblogs.com/forfuture1978/archive/2010/05/19/1738803.html
//http://www.javadocexamples.com/java_source/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java.html
/**
 * Small demo of Lucene's {@code MoreLikeThisQuery}: it either builds a tiny
 * sample index of job postings under {@code ./Index}, or runs a
 * "more like this" search for a piece of text against that index and prints
 * every hit with its score.
 *
 * <p>Usage: run with the single argument {@code index} to (re)write the sample
 * documents; run with no arguments to search (the default).
 */
public class LuceneTestLike {

    /** Location of the on-disk index, relative to the working directory. */
    private static final String INDEX_PATH = "./Index";

    /**
     * Entry point.
     *
     * @param args optional; if the first argument is {@code index}
     *             (case-insensitive) the sample documents are indexed,
     *             otherwise the search is run
     */
    public static void main(String[] args) {
        // Searching stays the default so that running without arguments
        // behaves as before.
        boolean isIndex = args != null && args.length > 0 && "index".equalsIgnoreCase(args[0]);
        try {
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_29);
            if (isIndex) {
                buildIndex(analyzer);
            } else {
                searchSimilar(analyzer);
            }
        } catch (Exception ex) {
            // Demo program: report the failure and exit.
            ex.printStackTrace();
        }
    }

    /**
     * Writes the six sample job postings into the index at {@link #INDEX_PATH}.
     * The writer and the directory are always closed, even when adding a
     * document fails, so the index write lock is not left behind.
     *
     * @param analyzer analyzer used to tokenize the analyzed fields
     * @throws java.io.IOException if the index cannot be opened or written
     */
    private static void buildIndex(StandardAnalyzer analyzer) throws java.io.IOException {
        NIOFSDirectory dir = new NIOFSDirectory(new File(INDEX_PATH));
        try {
            IndexWriter writer = new IndexWriter(dir, analyzer, MaxFieldLength.LIMITED);
            try {
                writer.addDocument(jobDoc("java 开发人员",
                        "招聘 网站开发人员,要求一年或以上工作经验", "20100201"));
                writer.addDocument(jobDoc("高级开发人员(java 方向)",
                        "需要有四年或者以上的工作经验,有大型项目实践,java基本扎实", "20100131"));
                writer.addDocument(jobDoc("php 开发工程师",
                        "主要是维护公司的网站php开发,能独立完成网站的功能", "20100201"));
                writer.addDocument(jobDoc("linux 管理员",
                        "管理及维护公司的linux服务器,职责包括完成mysql数据备份及日常管理,apache的性能调优等", "20100201"));
                writer.addDocument(jobDoc("lucene开发工程师",
                        "需要两年或者以上的从事lucene java 开发工作的经验,需要对算法,排序规则等有相关经验,java水平及基础要扎实", "20100131"));
                writer.addDocument(jobDoc("php 软件工程师",
                        "具有大量的php开发经验,如熟悉 java 开发,数据库管理则更佳", "20100130"));
            } finally {
                writer.close();
            }
        } finally {
            dir.close();
        }
        System.out.println("数据索引完成");
    }

    /**
     * Builds one sample document. {@code Name} and {@code Info} are stored and
     * analyzed; {@code Time} is stored and indexed as a single un-analyzed term.
     *
     * @param name job title
     * @param info job description
     * @param time posting date as written in the sample data (e.g. 20100201)
     * @return the populated document
     */
    private static Document jobDoc(String name, String info, String time) {
        Document doc = new Document();
        doc.add(new Field("Name", name, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("Info", info, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("Time", time, Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /**
     * Runs a {@code MoreLikeThisQuery} for a fixed piece of text against the
     * {@code Name} field and prints the top 10 hits and the elapsed time.
     * The searcher and the directory are always closed.
     *
     * @param analyzer analyzer used to tokenize the "like" text
     * @throws java.io.IOException if the index cannot be opened or read
     */
    private static void searchSimilar(StandardAnalyzer analyzer) throws java.io.IOException {
        NIOFSDirectory dir = new NIOFSDirectory(new File(INDEX_PATH));
        try {
            // second argument: open the index read-only
            IndexSearcher searcher = new IndexSearcher(dir, true);
            try {
                // Other texts the author tried: "lucene java", "开发工程师"
                String kw = "php 开发工程师";
                // The author also tried { "Name", "Info" }
                String[] moreLikeFields = { "Name" };
                MoreLikeThisQuery query = new MoreLikeThisQuery(kw, moreLikeFields, analyzer);
                // Stop words, if wanted, can be supplied via query.setStopWords(...).

                // Terms occurring fewer than this many times in the source text are ignored.
                query.setMinTermFrequency(1);
                // Upper bound on the number of terms in the generated query.
                query.setMaxQueryTerms(5);
                // Ignore terms that occur in fewer than this many documents
                // (per the original note, the library default is 5, too high for a 6-document index).
                query.setMinDocFreq(1);

                System.out.println("搜索条件:" + query.toString());
                long start = System.currentTimeMillis();
                TopDocs topDocs = searcher.search(query, 10);
                ScoreDoc[] hits = topDocs.scoreDocs;
                for (int i = 0; i < hits.length; i++) {
                    int docId = hits[i].doc;
                    // The score is taken from the Explanation, as in the original sample.
                    // NOTE(review): ScoreDoc.score is the usual, cheaper source; explain()
                    // is costly per hit. Confirm both agree before switching.
                    Explanation exp = searcher.explain(query, docId);
                    Document hitDoc = searcher.doc(docId);
                    String Name = hitDoc.get("Name");
                    String Info = hitDoc.get("Info");
                    String Time = hitDoc.get("Time");
                    float score = exp.getValue();
                    // Print exp.toString() here to see the detailed score breakdown.
                    System.out.println("DocId:" + docId + "\tScore:" + score + "\tName:" + Name
                            + "\tTime:" + Time + "\tInfo:" + Info);
                }
                long end = System.currentTimeMillis();
                System.out.println("搜索用时:" + (end - start) + "ms");
            } finally {
                searcher.close();
            }
        } finally {
            dir.close();
        }
    }
}
运行结果:
搜索条件:like:php 开发工程师
DocId:2 Score:1.1971036 Name:php 开发工程师 Time:20100201 Info:主要是维护公司的网站php开发,能独立完成网站的功能
DocId:5 Score:0.82631415 Name:php 软件工程师 Time:20100130 Info:具有大量的php开发经验,如熟悉 java 开发,数据库管理则更佳
DocId:4 Score:0.6882751 Name:lucene开发工程师 Time:20100131 Info:需要两年或者以上的从事lucene java 开发工作的经验,需要对算法,排序规则等有相关经验,java水平及基础要扎实
DocId:0 Score:0.038315877 Name:java 开发人员 Time:20100201 Info:招聘 网站开发人员,要求一年或以上工作经验
DocId:1 Score:0.027368484 Name:高级开发人员(java 方向) Time:20100131 Info:需要有四年或者以上的工作经验,有大型项目实践,java基本扎实
搜索用时:47ms