lucene全文检索学习记录，附带源码——三种实现，超全超细致

最新推荐文章于 2024-07-04 14:41:08 发布

weixin_34355559

最新推荐文章于 2024-07-04 14:41:08 发布

阅读量88

点赞数

原文链接：https://my.oschina.net/u/1433614/blog/189885

版权

2019独角兽企业重金招聘Python工程师标准>>>

Lucene学习记录

———三种实现，超全超细致

下载lucene3.6.0.zip http://download.csdn.net/detail/leilovegege/6800405 ，解压，将里边的lucene-core-3.6.0.jar等包拷贝到工程lib中。还需要中文分词器IKAnalyzer3.2.8.jar，ojdbc14.jar

至此环境搭配完成。

下面开始实现。本文只使用 Java 类，没有连接 web 页面，所以测试时直接运行对应的 java 文件，在控制台中查看结果。

工程源文件 lucene36：http://download.csdn.net/detail/leilovegege/6804669

Test1检索绝对路径上的文件

package test1;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileReader;

import java.io.IOException;

import java.io.InputStreamReader;

import java.util.Date;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.FieldSelectorResult;

import org.apache.lucene.document.NumericField;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

public class TestFileIndexer {

public static void main(String[] args) throws Exception {

/* 指明要索引文件夹的位置,这里是C盘的source文件夹下 */

File fileDir = new File( "E:\\Documents and Settings\\Administrator\\Workspaces\\MyEclipse 8.6\\lucene36\\source" );

/* 这里放索引文件的位置 */

File indexDir = new File( ".\\index\\test1" );

Directory dir=FSDirectory.open(indexDir);//将索引存放在磁盘上

Analyzer lucenAnalyzer=new StandardAnalyzer(Version.LUCENE_36);//分析器

IndexWriterConfig iwc=new IndexWriterConfig(Version.LUCENE_36,lucenAnalyzer);

iwc.setOpenMode(OpenMode.CREATE);//创建新的索引文件create 表示创建或追加到已有索引库

IndexWriter indexWriter=new IndexWriter(dir,iwc);//把文档写入到索引库

File[] textFiles=fileDir.listFiles();//得到索引文件夹下所有文件

long startTime=new Date().getTime();

//增加document到检索去

for (int i = 0; i < textFiles.length; i++) {

// if (textFiles[i].isFile()&& textFiles[i].getName().endsWith(".txt")) {

System.out.println(":;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;");

System.out.println("File"+textFiles[i].getCanonicalPath()+"正在被索引...");

String temp=FileReaderAll(textFiles[i].getCanonicalPath(),"GBK");

System.out.println(temp);

Document document=new Document();

Field FieldPath=new Field("path",textFiles[i].getPath(),Field.Store.YES,Field.Index.NO);

Field FieldBody=new Field("body",temp,Field.Store.YES,Field.Index.ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS);

NumericField modifiField=new NumericField("modified");//所以key为modified

modifiField.setLongValue(fileDir.lastModified());

document.add(FieldPath);

document.add(FieldBody);

document.add(modifiField);

indexWriter.addDocument(document);

// }

}

indexWriter.close();

//计算一下索引的时间

long endTime=new Date().getTime();

System.out.println("花了"+(endTime-startTime)+"毫秒把文档添加到索引里面去"+fileDir.getPath());

}

public static String FileReaderAll(String FileName,String charset)throws IOException{

BufferedReader reader=new BufferedReader(new InputStreamReader(new FileInputStream(FileName),charset));

String line=new String();

String temp=new String();

while ((line=reader.readLine())!=null) {

temp+=line;

}

reader.close();

return temp;

}

package test1;

import java.io.BufferedReader;

import java.io.File;

import java.io.IOException;

import java.io.InputStreamReader;

import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryParser.ParseException;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

public class TestQuery {

public static void main(String[] args) throws ParseException, IOException {

String index="./index/test1";//搜索的索引路径

IndexReader reader=IndexReader.open(FSDirectory.open(new File(index)));

IndexSearcher searcher=new IndexSearcher(reader);//检索工具

ScoreDoc[] hits=null;

// BufferedReader reader1=new BufferedReader(new InputStreamReader(System.in));

// String queryString=reader1.readLine().toString(); //搜索关键字

// Scanner sca=new Scanner(System.in);

// String queryString=sca.next().toString();

String queryString="测试";

Query query=null;

Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_36);

try {

QueryParser qp=new QueryParser(Version.LUCENE_36,"body",analyzer);//用于解析用户输入的工具

query=qp.parse(queryString);

} catch (Exception e) {

// TODO: handle exception

}

if (searcher!=null) {

TopDocs results=searcher.search(query, 10);//只取排名前十的搜索结果

hits=results.scoreDocs;

Document document=null;

if (hits.length>0) {

System.out.println("找到"+hits.length+"条结果");

for (int i = 0; i < hits.length; i++) {

document=searcher.doc(hits[i].doc);

String body=document.get("body");

String path=document.get("path");

String modifiedtime=document.get("modifiField");

System.out.print(body+" ");

System.out.println(path);

}

}else

System.out.println("没查到结果");

searcher.close();

reader.close();

}else

System.out.println("没查找到索引");

}

Test2检索相对路径上的文件

package test2;

import java.io.*;

import java.util.HashMap;

import java.util.Iterator;

import java.util.Set;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopScoreDocCollector;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import org.wltea.analyzer.lucene.IKAnalyzer;

import org.wltea.analyzer.lucene.IKQueryParser;

public class MyLucene {

private static final File INDEX_PATH = new File(".\\index\\test2"); // 索引文件位置, 当前路径下的index文件

private static final String filePath = ".\\luceneDataSource\\test.txt";// 索引数据源文件位置，当前路径下的luceneDataSource\test.txt文件

private static final Analyzer ANALYZER = new IKAnalyzer(); // 中文分词器

public static void main(String[] args){

/**

* 创建索引

File readFile = new File(filePath); // 获取数据源文件

HashMap<String, String> words = readFile(readFile);

Document doc = null;

if (words != null) {

try {

IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, ANALYZER);

iwc.setOpenMode(OpenMode.CREATE);//创建新的索引文件create 表示创建或追加到已有索引库，没有这句话索引库会有重复的

IndexWriter writer = new IndexWriter(FSDirectory.open(INDEX_PATH), iwc);

Set<String> keys = words.keySet();

for (Iterator<String> it = keys.iterator(); it.hasNext();) {

String key = it.next();

doc = new Document();

Field index = new Field("index", key, Field.Store.YES,Field.Index.ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS);

Field contents = new Field("contents", words.get(key),Field.Store.YES, Field.Index.NO);

doc.add(index);

doc.add(contents);

writer.addDocument(doc);

}

writer.close(); // 这里不关闭，建立索引会失败

} catch (Exception e) {

e.printStackTrace();

}

else

System.out.println("文件读取错误");

}

/**

* 判断索引库是已否创建

public boolean noIndex() {

File[] indexs = INDEX_PATH.listFiles();

if (indexs.length == 0) {

return true;

} else {

return false;

}

/**

* 读取文件

* @param file

public static HashMap<String, String> readFile(File file) {

InputStream in = null;

InputStreamReader inR = null;

BufferedReader br = null;

HashMap<String, String> wordsMap = new HashMap<String, String>();

try {

in = new FileInputStream(file);

inR = new InputStreamReader(in, "GBK"); //utf-8

br = new BufferedReader(inR);

String line;

while ((line = br.readLine()) != null) {

System.out.println(line);

wordsMap.put(line.trim(), line.trim());

}

return wordsMap;

} catch (Exception e) {

e.printStackTrace();

return null;

} finally {

try {

if (in != null)

in.close();

if (inR != null)

inR.close();

if (br != null)

br.close();

} catch (Exception e) {

e.printStackTrace();

return null;

}

/**

* 检索

* @param queryStr

* @param hitsPerPage

public void search(String queryStr) {

try {

IndexReader reader = IndexReader.open(FSDirectory.open(INDEX_PATH));// 得到索引的目录

IndexSearcher searcher = new IndexSearcher(reader);

Query query = IKQueryParser.parse("index", queryStr);

TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);

searcher.search(query, collector);

ScoreDoc[] hits = collector.topDocs().scoreDocs;

if(hits.length > 0){

System.out.println("检索词："+queryStr+"\t共找到 "+hits.length+"条记录");

for (int i = 0; i < hits.length; i++) {

Document result = searcher.doc(hits[i].doc);

System.out.println((i+1) +")" + "\n index:" + result.get("index") + "\n contents:" + result.get("contents"));

}

}else{

System.out.println("未找到结果");

}

} catch (Exception e) {

System.out.println("Exception");

}

package test2;

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStreamReader;

public class TestMyLucene {

public static void main(String[] args) throws IOException {

MyLucene myLucene = new MyLucene();

// 索引库是已否创建,如果没有则创建

if(myLucene.noIndex()){

System.out.println("索引库还没有创建");

}else{

BufferedReader reader1=new BufferedReader(new InputStreamReader(System.in));

String queryString=reader1.readLine().toString(); //搜索关键字

myLucene.search(queryString);

}

Test3检索数据库中的数据（本例为oracle）

package test3;

import java.io.File;

import java.io.IOException;

import java.sql.Connection;

import java.sql.ResultSet;

import java.sql.SQLException;

import java.sql.Statement;

import java.text.DateFormat;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.List;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import org.wltea.analyzer.lucene.IKAnalyzer;

public class IndexCreateUtill {

private List<NewsItem> list;

public void createIndexForMynews() throws IOException, ClassNotFoundException{

//存放索引的文件夹

File indxeFile = new File(".\\index\\test3");

//创建Directory对象

Directory directory =FSDirectory.open(indxeFile);

//使用IKAnalyzer分词器

Analyzer analyzer = new IKAnalyzer();

//创建IndexWriterConfig

IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);

//创建IndexWriter

indexWriterConfig.setOpenMode(OpenMode.CREATE);

IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);

//从数据库中读取出所有的新闻记录以便进行索引的创建

try {

// DBSource dbSource =DBSource.getInstance();

// Connection conn = dbSource.getConnection();

Connection conn=Utils.getConnection();

Statement stmt = null;

ResultSet rs = null;

String sql = "select * from t_newsitem";

stmt = conn.createStatement();

rs = stmt.executeQuery(sql);

list = new ArrayList<NewsItem>();

while(rs.next()){

NewsItem newsItem = new NewsItem();

newsItem.setId(rs.getInt("id"));

newsItem.setNewsTitle(rs.getString("newsTitle"));

newsItem.setNewsContent(rs.getString("newsContent"));

newsItem.setPublishTime(rs.getTimestamp("publishTime"));

newsItem.setResource(rs.getString("resourcer"));

newsItem.setT_newsType_id(rs.getInt("t_newsType_id"));

newsItem.setEditor(rs.getString("editor"));

list.add(newsItem);

}

DateFormat dateFormat = new SimpleDateFormat("yyyy年MM月dd日 HH时mm分ss秒");

for (int i=0;i<list.size();i++) {

//建立一个lucene文档

Document doc = new Document();

//得到新闻标题

String newsTitle = list.get(i).getNewsTitle();

//得到新闻内容

String newsContent = list.get(i).getNewsContent();

//得到新闻事件

String publishDate = dateFormat.format(list.get(i).getPublishTime());

//得到新闻主键id

String id = list.get(i).getId() + "";

//将新闻标题加入文档，因为要搜索和高亮，所以index是tokennized，TermVector是WITH_POSITIONS_OFFSETS

doc.add(new Field("title" , newsTitle , Field.Store.YES , Field.Index.ANALYZED , Field.TermVector.WITH_POSITIONS_OFFSETS));

//添加新闻内容至文档，与标题相似

doc.add(new Field("content" , newsContent , Field.Store.YES , Field.Index.ANALYZED , Field.TermVector.WITH_POSITIONS_OFFSETS));

//添加时间至文档，因为要按照此字段降序排列排序，所以tokenzied,不用高亮所以TermVector是no就行了

doc.add(new Field("date" , publishDate , Field.Store.YES , Field.Index.ANALYZED , Field.TermVector.NO));

//添加主键至文档，不分词，不高亮。

doc.add(new Field("id" , id , Field.Store.YES , Field.Index.NO , Field.TermVector.NO));

indexWriter.addDocument(doc);

}

indexWriter.close();

Utils.closeAll(rs, stmt, conn);

} catch (SQLException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

public static void main(String[] args) throws Exception {

IndexCreateUtill util = new IndexCreateUtill();

util.createIndexForMynews();

}

package test3;

import java.io.Serializable;

import java.util.Date;

/**
 * Plain serializable bean holding one news row: id, title, content, publish time,
 * resource, news-type id and editor.
 */
public class NewsItem implements Serializable {

    private static final long serialVersionUID = 1L;

    private Integer id;
    private String newsTitle;
    private String newsContent;
    private Date publishTime;
    private String resource;
    private Integer t_newsType_id;
    private String editor;

    /** Creates an item with every property unset ({@code null}). */
    public NewsItem() {
    }

    /**
     * Creates a fully populated item.
     *
     * @param id            primary key
     * @param newsTitle     title text
     * @param newsContent   body text
     * @param publishTime   publication time
     * @param resource      value of the resource property
     * @param t_newsType_id identifier of the news type
     * @param editor        editor name
     */
    public NewsItem(Integer id, String newsTitle, String newsContent,
            Date publishTime, String resource, Integer t_newsType_id, String editor) {
        this.id = id;
        this.newsTitle = newsTitle;
        this.newsContent = newsContent;
        this.publishTime = publishTime;
        this.resource = resource;
        this.t_newsType_id = t_newsType_id;
        this.editor = editor;
    }

    /** @return the primary key */
    public Integer getId() {
        return id;
    }

    /** @param id the primary key */
    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the title text */
    public String getNewsTitle() {
        return newsTitle;
    }

    /** @param newsTitle the title text */
    public void setNewsTitle(String newsTitle) {
        this.newsTitle = newsTitle;
    }

    /** @return the body text */
    public String getNewsContent() {
        return newsContent;
    }

    /** @param newsContent the body text */
    public void setNewsContent(String newsContent) {
        this.newsContent = newsContent;
    }

    /** @return the publication time (the stored instance, not a copy) */
    public Date getPublishTime() {
        return publishTime;
    }

    /** @param publishTime the publication time (stored as given, not copied) */
    public void setPublishTime(Date publishTime) {
        this.publishTime = publishTime;
    }

    /** @return the resource property */
    public String getResource() {
        return resource;
    }

    /** @param resource the resource property */
    public void setResource(String resource) {
        this.resource = resource;
    }

    /** @return the news-type identifier */
    public Integer getT_newsType_id() {
        return t_newsType_id;
    }

    /** @param t_newsType_id the news-type identifier */
    public void setT_newsType_id(Integer t_newsType_id) {
        this.t_newsType_id = t_newsType_id;
    }

    /** @return the editor name */
    public String getEditor() {
        return editor;
    }

    /** @param editor the editor name */
    public void setEditor(String editor) {
        this.editor = editor;
    }
}

package test3;

import java.io.File;

import java.io.IOException;

import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryParser.ParseException;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

public class TestQuery {

public static void main(String[] args) throws ParseException, IOException {

String index=".\\index\\test3";//搜索的索引路径

IndexReader reader=IndexReader.open(FSDirectory.open(new File(index)));

IndexSearcher searcher=new IndexSearcher(reader);//检索工具

ScoreDoc[] hits=null;

// BufferedReader reader1=new BufferedReader(new InputStreamReader(System.in));

// String queryString=reader1.readLine().toString(); //搜索关键字

Scanner sca=new Scanner(System.in);

String queryString=sca.next().toString();

System.out.print("搜索关键词为"+queryString+",");

Query query=null;

Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_36);

try {

QueryParser qp=new QueryParser(Version.LUCENE_36,"content",analyzer);//用于解析用户输入的工具

query=qp.parse(queryString);

} catch (Exception e) {

// TODO: handle exception

}

if (searcher!=null) {

TopDocs results=searcher.search(query, 10);//只取排名前十的搜索结果

hits=results.scoreDocs;

Document document=null;

if (hits.length>0) {

System.out.println("找到"+hits.length+"条结果");

for (int i = 0; i < hits.length; i++) {

document=searcher.doc(hits[i].doc);

String title=document.get("title");

String content=document.get("content");

String date=document.get("date");

String id=document.get("id");

System.out.println("标题："+title);

System.out.println("内容："+content);

System.out.println("日期："+date);

System.out.println("ID:"+id);

}

}else

System.out.println("没查到结果");

searcher.close();

reader.close();

}else

System.out.println("没查找到索引");

}

package test3;

import java.sql.Connection;

import java.sql.DriverManager;

import java.sql.ResultSet;

import java.sql.SQLException;

import java.sql.Statement;

/**
 * JDBC helpers: opens the Oracle connection used by the indexer and closes JDBC
 * resources without propagating close failures.
 */
public class Utils {

    /**
     * Opens a connection to the local Oracle instance.
     *
     * @return the connection, or {@code null} if the driver class is missing or the
     *         connection attempt fails (the exception is printed)
     */
    public static Connection getConnection() {
        Connection con = null;
        try {
            Class.forName("oracle.jdbc.driver.OracleDriver");
            // The host part must not contain whitespace ("localhost :1521" was a typo).
            con = DriverManager.getConnection("jdbc:oracle:thin:@localhost:1521:orcl", "hr", "orcl");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return con;
    }

    /**
     * Closes the result set, the statement and the connection, in that order.
     * Each argument may be {@code null}.
     *
     * @param rs   result set to close
     * @param ps   statement to close
     * @param conn connection to close
     * @throws SQLException declared for compatibility with existing callers; close
     *                      failures are printed by the individual helpers
     */
    public static void closeAll(ResultSet rs, Statement ps, Connection conn) throws SQLException {
        closeResultSet(rs);
        closeStatement(ps);
        closeConnection(conn);
    }

    /**
     * Closes the connection; a close failure is printed, not thrown.
     *
     * @param con connection to close, may be {@code null}
     */
    public static void closeConnection(Connection con) {
        try {
            if (con != null) {
                con.close();
            }
        } catch (SQLException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Closes the statement; a close failure is printed, not thrown.
     *
     * @param st statement to close, may be {@code null}
     */
    public static void closeStatement(Statement st) {
        try {
            if (st != null) {
                st.close();
            }
        } catch (SQLException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Closes the result set; a close failure is printed, not thrown.
     *
     * @param rs result set to close, may be {@code null}
     */
    public static void closeResultSet(ResultSet rs) {
        try {
            if (rs != null) {
                rs.close();
            }
        } catch (SQLException ex) {
            ex.printStackTrace();
        }
    }
}

转载于:https://my.oschina.net/u/1433614/blog/189885

weixin_34355559

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫