6. 测试的主程序
规则:
加粗体的黑色代码,表示将作深入分析
// Test driver: builds a small in-memory index over seven tiny documents,
// then parses and runs the phrase query "a c e" against it and prints the
// top hits with their scores.
try {
    // --- Setup / indexing phase: write the documents into a RAMDirectory. ---
    Directory directory = new RAMDirectory();
    Analyzer analyzer = new SimpleAnalyzer();
    IndexWriter writer = new IndexWriter(directory, analyzer, true);
    String[] docs = {
        "a b c d e",
        "a b c d e a b c d e",
        "a b c d e f g h i j",
        "a c e",
        "e c a",
        "a c e a c e",
        "a c e a b c"
    };
    for (int j = 0; j < docs.length; j++) {
        Document d = new Document();
        d.add(Field.Text("contents", docs[j]));
        writer.addDocument(d);
    }
    writer.close();

    // Open the searcher over the freshly written index (analyzed in 1.1).
    Searcher searcher = new IndexSearcher(directory);

    // FIX: the original listing garbled the escapes as /" — the query
    // string is the phrase "a c e" enclosed in escaped double quotes.
    String[] queries = {"\"a c e\"",
    };
    Hits hits = null;
    QueryParser parser = new QueryParser("contents", analyzer);
    parser.setPhraseSlop(0); // exact-phrase matching only
    for (int j = 0; j < queries.length; j++) {
        Query query = parser.parse(queries[j]); // here query is a PhraseQuery
        System.out.println("Query: " + query.toString("contents"));
        // Execute the search (analyzed in 1.2).
        hits = searcher.search(query);
        System.out.println(hits.length() + " total results");
        for (int i = 0 ; i < hits.length() && i < 10; i++) {
            Document d = hits.doc(i);
            System.out.println(i + " " + hits.score(i)
            // + " " + DateField.stringToDate(d.get("modified"))
                + " " + d.get("contents"));
        }
    }
    searcher.close();
} catch (Exception e) {
    // FIX: "/n" in the original listing was a garbled "\n" escape.
    System.out.println(" caught a " + e.getClass() +
        "\n with message: " + e.getMessage());
}
查询结果:
Query: "a c e"
3 total results
0 1.0 a c e a c e
1 0.9428091 a c e
2 0.7071068 a c e a b c
1.1. Searcher searcher = new IndexSearcher(directory)
1.1.1. 初始化
通过目录,创建一个索引搜索器,
调用类
// Creates a searcher over the given Directory: opens an IndexReader on it and
// delegates to the private constructor with closeReader=true, so the reader
// is closed when this searcher is closed.
IndexSearcher :: public IndexSearcher(Directory directory) throws IOException {
this( IndexReader.open(directory) , true);
}
调用
// Internal constructor: stores the reader and remembers whether this
// searcher owns it (closeReader) and therefore must close it later.
private IndexSearcher(IndexReader r, boolean closeReader) {
reader = r;
this.closeReader = closeReader;
}
调用
// Opens an IndexReader over the directory. The whole open runs under the
// commit lock (via Lock.With) and is synchronized on the directory object
// for both in-process and inter-process safety.
private static IndexReader open(final Directory directory, final boolean closeDirectory) throws IOException {
synchronized (directory) { // in- & inter-process sync
return (IndexReader)new Lock.With(
directory.makeLock(IndexWriter.COMMIT_LOCK_NAME),
IndexWriter.COMMIT_LOCK_TIMEOUT) {
public Object doBody() throws IOException {
SegmentInfos infos = new SegmentInfos();
// Read the SegmentInfos (the "segments" file) from the directory.
infos.read(directory);
if (infos.size() == 1) { // index is optimized
// Single segment: a plain SegmentReader suffices.
return new SegmentReader(infos, infos.info(0), closeDirectory);
} else {
// Multiple segments: one SegmentReader each, wrapped in a MultiReader.
IndexReader[] readers = new IndexReader[infos.size()];
for (int i = 0; i < infos.size(); i++)
readers[i] = new SegmentReader(infos.info(i));
return new MultiReader(directory, infos, closeDirectory, readers);
}
}
}.run();
}
}
代码到这里,已经读取了文件 segments 文件,获得段信息,该测试只有一个段,所以执行了 return new SegmentReader(infos, infos.info(0), closeDirectory); ,记住 IndexReader = SegmentReader
infos.read(directory):
/** 读取输入参数的目录,下的 segments 文件
* 代码分析:
* 1 。读取格式,小于 0 表示该文件有隐含的格式信息,小于- 1 就表示该格式是未知的,因为最小的格式是- 1
* 2 。小于 0 时,再读取版本信息以及段的计数
* 3 。大于 0 ,表示 segments 文件开头部分没有版本信息,只有段的计数
* 4 。读取段的数量
* 5 。循环读取段信息,然后构建段信息对象,最后把这些对象都加入到段集合中
* 6 。大于 0 时,判断是否文件最后有版本信息,有的话就赋值 version ,没有的话, version = 0 */ , 该段代码比较简单,读者可以从看 src 中代码
return new SegmentReader(infos, infos.info(0), closeDirectory);
// SegmentReader constructor: initializes the IndexReader base state (directory,
// segment infos, ownership flags), then opens this segment's files in
// initialize(si).
SegmentReader(SegmentInfos sis, SegmentInfo si, boolean closeDir)
throws IOException {
super(si.dir, sis, closeDir);
initialize(si);
}
super(si.dir, sis, closeDir);
// Base IndexReader constructor: records the directory and segment infos and
// resets the commit/write bookkeeping. directoryOwner=true marks this reader
// as the one responsible for writing changes (e.g. deletions) back.
IndexReader :: IndexReader(Directory directory, SegmentInfos segmentInfos, boolean closeDirectory) {
this.directory = directory;
this.segmentInfos = segmentInfos;
directoryOwner = true;
this.closeDirectory = closeDirectory;
stale = false;
hasChanges = false;
writeLock = null;
}
SegmentReader :: initialize(si);
/** 初始化这个段信息
该段代码是初始化了
* 1 。读入域信息,只有域的名字
* 2. 打开保存域、保存域索引的文件
*/
// Initializes this reader for one segment: reads the field-name table, opens
// the stored-field files, the term dictionary, the deletions bitvector (if
// present), the frequency and position postings streams, the norms, and —
// only if the segment has them — the term vector files.
private void initialize(SegmentInfo si) throws IOException
{
segment = si.name;
// Use compound file directory for some files, if it exists
Directory cfsDir = directory();// the directory that holds this segment
// CompoundFileReader is itself a Directory subclass
if (directory().fileExists(segment + ".cfs")) {
cfsReader = new CompoundFileReader(directory(), segment + ".cfs");
cfsDir = cfsReader;
}
// 1. read the field infos (field names only)
fieldInfos = new FieldInfos(cfsDir, segment + ".fnm"); // all field infos are read here
// 2. open the stored-fields file and its index
fieldsReader = new FieldsReader(cfsDir, segment, fieldInfos);
tis = new TermInfosReader(cfsDir, segment, fieldInfos);
if (hasDeletions(si))
deletedDocs = new BitVector(directory(), segment + ".del");// read the deletions bitvector
freqStream = cfsDir.openFile(segment + ".frq");// open the frequency (.frq) file
proxStream = cfsDir.openFile(segment + ".prx");// open the positions (.prx) file
openNorms(cfsDir);// read norm files segment.f1, segment.f2, ... into a hashtable
if (fieldInfos.hasVectors()) { // open term vector files only as needed
termVectorsReader = new TermVectorsReader(cfsDir, segment, fieldInfos);
}
}
1.2. hits = searcher.search(query);
这时, searcher = IndexSearcher , 对该代码的跟踪如下:
调用: return search(query, (Filter)null)
调用: return new Hits(this, query, filter);
// Hits constructor (FIX: the class is Hits — the original heading wrote
// "Hit ::"). Stores the query, searcher, and filter, then eagerly fetches
// the first batch of results. NOTE(review): the "retrieve 100 initially"
// comment suggests getMoreDocs(n) requests about 2*n documents — confirm
// against the getMoreDocs implementation.
调用: Hits :: Hits(Searcher s, Query q, Filter f) throws IOException {
query = q;
searcher = s;
filter = f;
getMoreDocs(50); // retrieve 100 initially
}
getMoreDocs(int min) 调用:: TopDocs topDocs = searcher.search(query, filter, n)
searcher.search(query, filter, n) 调用 Scorer scorer = query.weight(this).scorer(reader);
// Top-level search: builds a Scorer for the query, scores every matching
// document through an anonymous HitCollector, keeps the best nDocs hits in a
// bounded priority queue, and returns them (plus the total hit count) as a
// TopDocs. A null scorer means no term of the query exists in the index.
IndexSearcher :: public TopDocs search(Query query, Filter filter, final int nDocs)
throws IOException {
Scorer scorer = query.weight(this).scorer(reader);
if (scorer == null)
return new TopDocs(0, new ScoreDoc[0]);
// Optional filter: a BitSet of the documents that are allowed to match.
final BitSet bits = filter != null ? filter.bits(reader) : null;
final HitQueue hq = new HitQueue(nDocs);
final int[] totalHits = new int[1];
scorer.score(new HitCollector() {
private float minScore = 0.0f ;
public final void collect(int doc, float score) {
if (score > 0.0f && // ignore zeroed buckets
(bits==null || bits.get(doc))) { // skip docs not in bits
totalHits[0]++;
if (hq.size() < nDocs || score >= minScore) {
hq.insert(new ScoreDoc(doc, score));
minScore = ((ScoreDoc)hq.top()).score; // maintain minScore
}
}
}
});
// Drain the queue from lowest to highest so the array ends up sorted
// by descending score.
ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
for (int i = hq.size()-1; i >= 0; i--) // put docs in array
scoreDocs[i] = (ScoreDoc)hq.pop();
return new TopDocs(totalHits[0], scoreDocs);
}
1.2.1. Scorer scorer = query.weight(this).scorer(reader);
参数分析: query = PhraseQuery (该参数由主测试程序中的 Query query = parser.parse(queries[j]); 初始化)
this = IndexSearcher (该参数初始化,已经初始化了主要的文件,具体可参考 1.1 )
由代码
1 PhraseQuery ::
// PhraseQuery.createWeight: a single-term "phrase" is optimized into a plain
// TermQuery (carrying this query's boost); otherwise build a PhraseWeight.
protected Weight createWeight(Searcher searcher) {
if (terms.size() == 1) { // optimize one-term case
Term term = (Term)terms.elementAt(0);
Query termQuery = new TermQuery(term);
termQuery.setBoost(getBoost());
return termQuery.createWeight(searcher);
}
return new PhraseWeight(searcher);
}
query.weight(this) 创建了 PhraseWeight(searcher)
Scorer scorer = query.weight(this).scorer(reader) 就相当于 PhraseWeight(searcher).scorer(reader), 即调用以下代码:
2 PhraseQuery ::
// PhraseWeight.scorer: opens a TermPositions enumerator for every term of the
// phrase. If any term is absent from the index the phrase cannot match, so
// null is returned. With slop == 0 an ExactPhraseScorer is built.
public Scorer scorer(IndexReader reader) throws IOException {
if (terms.size() == 0) // optimize zero-term case
return null;
// read the positional postings of each term
TermPositions[] tps = new TermPositions[terms.size()];
for (int i = 0; i < terms.size(); i++) {
TermPositions p = reader.termPositions((Term)terms.elementAt(i));
if (p == null)
return null;
tps[i] = p;
}
// At this point tps holds every term's postings; concretely these
// TermPositions[] are SegmentTermPositions[].
if (slop == 0) // optimize exact case
return new ExactPhraseScorer(this, tps, getPositions(), getSimilarity(searcher) ,
reader.norms(field));
// NOTE(review): the slop != 0 branch (SloppyPhraseScorer) is omitted
// from this excerpt.
}
- TermPositions p = reader.termPositions((Term)terms.elementAt(i));
这时 Term 文本为查询里的项
// IndexReader.termPositions(Term): obtain a fresh positions enumerator and
// position it on the given term before returning it.
public TermPositions termPositions(Term term) throws IOException {
TermPositions termPositions = termPositions();
termPositions.seek(term);
return termPositions;
}
termPositions() ::
// SegmentReader.termPositions(): the enumerator is a SegmentTermPositions
// bound to this segment reader.
SegmentReader :: public final TermPositions termPositions() throws IOException {
return new SegmentTermPositions(this);
}
parent = SegmentReader, 即刚才的段读取器
tis = new TermInfosReader(cfsDir, segment, fieldInfos); 即项信息读取器
SegmentTermPositions(this) ::
// SegmentTermPositions constructor: the superclass wires up the .frq stream;
// here a private clone of the parent's .prx (positions) stream is taken so
// this enumerator can seek independently of other enumerators.
SegmentTermPositions :: SegmentTermPositions(SegmentReader p) throws IOException {
super(p);
this.proxStream = (InputStream)parent.proxStream.clone();
}
super(p) ::
// SegmentTermDocs constructor: clones the parent's .frq (frequency) stream
// for independent seeking, shares its deletions bitvector, and caches the
// skip interval from the term-infos reader.
SegmentTermDocs(SegmentReader parent)
throws IOException {
this.parent = parent;
this.freqStream = (InputStream) parent.freqStream.clone();
this.deletedDocs = parent.deletedDocs;
this.skipInterval = parent.tis.getSkipInterval();
}
termPositions.seek(term);
// Looks up the term's TermInfo in the term-infos reader (this lookup is
// thread-safe) and repositions this enumerator on it.
public void seek(Term term) throws IOException {
TermInfo ti = parent.tis.get(term);
seek(ti);
}
seek(TermInfo ti)
SegmentTermDocs 的项信息转变为现在读入的项的信息
// Repositions the enumerator on the given TermInfo: resets the doc/skip
// counters, records the freq/prox/skip file pointers, and seeks the
// frequency stream. A null ti means the term does not exist (df = 0).
void seek(TermInfo ti) throws IOException {
count = 0;
if (ti == null) {
df = 0;
} else {
df = ti.docFreq;
doc = 0;
skipDoc = 0;
skipCount = 0;
numSkips = df / skipInterval;
freqPointer = ti.freqPointer;
proxPointer = ti.proxPointer;
skipPointer = freqPointer + ti.skipOffset;
freqStream.seek(freqPointer);
haveSkipped = false;
}
}
new ExactPhraseScorer(this, tps, getPositions(), getSimilarity(searcher) , reader.norms(field));
调用构造器
ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
byte[] norms) throws IOException {
super(weight, tps, positions, similarity, norms);
调用超类构造器,获得短语位置的频繁度信息和位置信息,并构造一个优先队列
// PhraseScorer constructor: records the norms, weight and weight value,
// chains one PhrasePositions per term into a singly linked list
// (first..last), and allocates an empty PhraseQueue priority queue.
PhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
byte[] norms) {
super(similarity);
this.norms = norms;
this.weight = weight;
this.value = weight.getValue();
// convert tps to a list
// (the PhrasePositions are kept in a plain linked list)
for (int i = 0; i < tps.length; i++) {
PhrasePositions pp = new PhrasePositions(tps[i], positions[i]);
if (last != null) { // add next to end of list
last.next = pp;
} else
first = pp;
last = pp;
}
pq = new PhraseQueue(tps.length); // construct empty pq
}
使用该记分器记分,并收集
scorer.score(new HitCollector()
// Drives scoring: advances to each matching document in turn and hands
// (doc, score) pairs to the collector.
public void score(HitCollector hc) throws IOException {
while (next()) {
hc.collect(doc(), score());
}
}
hc.collect(doc(), score());
score() 调用, value 为权值
// Score of the current document: tf(phrase frequency) times the query weight
// value, normalized by the current document's field norm.
PhraseScorer :: public float score() throws IOException {
//System.out.println("scoring " + first.doc);
float raw = getSimilarity().tf(freq) * value; // raw score
return raw * Similarity.decodeNorm(norms[first.doc]); // normalize
}
把各个位置的文档和得分收集
// HitCollector.collect: counts every positive-scoring (and unfiltered)
// document, and inserts it into the bounded priority queue when the queue is
// not yet full or the score beats the current minimum.
public final void collect(int doc, float score) {
if (score > 0.0f && // ignore zeroed buckets
(bits==null || bits.get(doc))) { // skip docs not in bits
totalHits[0]++;
if (hq.size() < nDocs || score >= minScore) {
hq.insert(new ScoreDoc(doc, score));
minScore = ((ScoreDoc)hq.top()).score; // maintain minScore
}
}
}
到这里就出来了查询的文档和分数,并且这些文档和分数经过了指定的排序和过滤