lucene study notes

最新推荐文章于 2023-01-10 18:27:01 发布

bill1973

最新推荐文章于 2023-01-10 18:27:01 发布

阅读量591

点赞数

分类专栏： cs 文章标签： lucene import null file class query

本文链接：https://blog.csdn.net/bill1973/article/details/1860796

版权

cs 专栏收录该内容

42 篇文章 0 订阅

订阅专栏

create index

package org.apache.lucene.demo;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.index.IndexWriter;

import java.io.File;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.util.Date;

/** Index all text files under a directory. */

public class IndexFiles {

private IndexFiles() {}

static final File INDEX_DIR = new File("index");

/** Index all text files under a directory. */

public static void main(String[] args) {

String usage = "java org.apache.lucene.demo.IndexFiles <root_directory>";

if (args.length == 0) {

System.err.println("Usage: " + usage);

System.exit(1);

}

if (INDEX_DIR.exists()) {

System.out.println("Cannot save index to '" +INDEX_DIR+ "' directory, please delete it first");

System.exit(1);

}

final File docDir = new File(args[0]);

if (!docDir.exists() || !docDir.canRead()) {

System.out.println("Document directory '" +docDir.getAbsolutePath()+ "' does not exist or is not readable, please check the path");

System.exit(1);

}

Date start = new Date();

try {

IndexWriter writer = new IndexWriter(INDEX_DIR, new StandardAnalyzer(), true); // step 1

System.out.println("Indexing to directory '" +INDEX_DIR+ "'...");

indexDocs(writer, docDir); // look here

System.out.println("Optimizing...");

writer.optimize();

writer.close();

Date end = new Date();

System.out.println(end.getTime() - start.getTime() + " total milliseconds");

} catch (IOException e) {

System.out.println(" caught a " + e.getClass() +

" with message: " + e.getMessage());

}

static void indexDocs(IndexWriter writer, File file)

throws IOException {

// do not try to index files that cannot be read

if (file.canRead()) {

if (file.isDirectory()) {

String[] files = file.list();

// an IO error could occur

if (files != null) {

for (int i = 0; i < files.length; i++) {

indexDocs(writer, new File(file, files[i]));

}

} else {

System.out.println("adding " + file);

try {

writer.addDocument(FileDocument.Document(file)); //step 2

}

// at least on windows, some temporary files raise this exception with an "access denied" message

// checking if the file can be read doesn't help

catch (FileNotFoundException fnfe) {

;

}

As someone said, Lucene is just an API. The key steps here are:
1. create IndexWriter
IndexWriter writer = new IndexWriter(INDEX_DIR, new StandardAnalyzer(), true);
2. make index for a file
writer.addDocument(FileDocument.Document(file));

search index

main step: Hits hits = searcher.search(query); [step 6]
helper classes:
1. IndexReader: steps 1 and 2 create the Searcher object
[ an easy way: IndexSearcher searcher = new IndexSearcher(dirStr); ]

2. QueryParser: steps 3, 4 and 5 create the query using the parse() method

The Hits class carries the search result information

package org.apache.lucene.demo;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.FilterIndexReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.Searcher;

import java.io.BufferedReader;

import java.io.FileReader;

import java.io.IOException;

import java.io.InputStreamReader;

import java.util.Date;

/** Simple command-line based search demo. */

public class SearchFiles {

/** Use the norms from one field for all fields. Norms are read into memory,

* using a byte of memory per document per searched field. This can cause

* search of large collections with a large number of fields to run out of

* memory. If all of the fields contain only a single token, then the norms

* are all identical, then single norm vector may be shared. */

private static class OneNormsReader extends FilterIndexReader {

private String field;

public OneNormsReader(IndexReader in, String field) {

super(in);

this.field = field;

}

public byte[] norms(String field) throws IOException {

return in.norms(this.field);

}

private SearchFiles() {}

/** Simple command-line based search demo. */

public static void main(String[] args) throws Exception {

String usage =

"Usage: java org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field]";

if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {

System.out.println(usage);

System.exit(0);

}

String index = "index";

String field = "contents";

String queries = null;

int repeat = 0;

boolean raw = false;

String normsField = null;

for (int i = 0; i < args.length; i++) {

if ("-index".equals(args[i])) {

index = args[i+1];

i++;

} else if ("-field".equals(args[i])) {

field = args[i+1];

i++;

} else if ("-queries".equals(args[i])) {

queries = args[i+1];

i++;

} else if ("-repeat".equals(args[i])) {

repeat = Integer.parseInt(args[i+1]);

i++;

} else if ("-raw".equals(args[i])) {

raw = true;

} else if ("-norms".equals(args[i])) {

normsField = args[i+1];

i++;

}

IndexReader reader = IndexReader.open(index); //step 1

if (normsField != null)

reader = new OneNormsReader(reader, normsField);

Searcher searcher = new IndexSearcher(reader); //step 2

Analyzer analyzer = new StandardAnalyzer(); //step 3

BufferedReader in = null;

if (queries != null) {

in = new BufferedReader(new FileReader(queries));

} else {

in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));

}

QueryParser parser = new QueryParser(field, analyzer); //step 4

while (true) {

if (queries == null) // prompt the user

System.out.println("Enter query: ");

String line = in.readLine();

if (line == null || line.length() == -1)

break;

line = line.trim();

if (line.length() == 0)

break;

Query query = parser.parse(line); //step 5

System.out.println("Searching for: " + query.toString(field));

Hits hits = searcher.search(query); // step 6 key issue

if (repeat > 0) { // repeat & time as benchmark

Date start = new Date();

for (int i = 0; i < repeat; i++) {

hits = searcher.search(query);

}

Date end = new Date();

System.out.println("Time: "+(end.getTime()-start.getTime())+"ms");

}

System.out.println(hits.length() + " total matching documents");

final int HITS_PER_PAGE = 10;

for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {

int end = Math.min(hits.length(), start + HITS_PER_PAGE);

for (int i = start; i < end; i++) {

if (raw) { // output raw format

System.out.println("doc="+hits.id(i)+" score="+hits.score(i));

continue;

}

Document doc = hits.doc(i);

String path = doc.get("path");

if (path != null) {

System.out.println((i+1) + ". " + path);

String title = doc.get("title");

if (title != null) {

System.out.println(" Title: " + doc.get("title"));

}

} else {

System.out.println((i+1) + ". " + "No path for this document");

}

if (queries != null) // non-interactive

break;

if (hits.length() > end) {

System.out.println("more (y/n) ? ");

line = in.readLine();

if (line.length() == 0 || line.charAt(0) == 'n')

break;

}

reader.close();

}

Removing Documents from an index

main step: IndexReader.deleteDocument(id)

// Path of the index directory (value elided in these notes).
String indexDir = ...

// Obtain the directory object for that path.
// NOTE(review): `dir` is assigned without a declared type here; presumably a Lucene Directory - confirm.
dir = FSDirectory.getDirectory(indexDir);

// Open a reader over the index in that directory.
IndexReader reader = IndexReader.open(dir);

// Number of the document to remove (value elided in these notes).
int id = ...

// The main step: remove the document with this id through the reader.
reader.deleteDocument(id);

// Close the reader once the deletion has been issued.
reader.close();

bill1973

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
lucene study notes

create index package org.apache.lucene.demo;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.index.IndexWriter;import java.io.File;import java.io.FileNotFoundE
复制链接

扫一扫