Questions to answer:
Why does ExtRanker_T end up holding keyword information?
And what does its constructor actually do?
pRanker = new ExtRanker_T < RankerState_ProximityBM25Exact_fn > ( tXQ, tTermSetup );
tXQ is in fact the parsed query: a tree of the search keywords, together with the machinery for matching them. So by the time ExtRanker_T is constructed, the ranking/scoring class already carries the keyword information.
ExtQwordsHash_t hQwords;
pRanker->GetQwords ( hQwords );
GetQwords() then copies some per-keyword information into hQwords. Why? Judging from SetQwordsIDF() further down, the point seems to be that the caller can fill this hash with per-keyword statistics (document frequencies, IDF) and hand them back to the ranker in one go, so collecting them in a single hash really does make the processing more convenient.
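A minimal sketch of that round trip, assuming the usual ExtQword_t fields (m_iDocs, m_fIDF); iTotalDocs and the IDF formula below are placeholders, not the actual sphinx.cpp code:

// sketch of the assumed qword/IDF round trip; iTotalDocs and the IDF formula
// are placeholders, the real statistics are filled in by the index code
ExtQwordsHash_t hQwords;
pRanker->GetQwords ( hQwords );			// 1) ranker reports every query word it created

hQwords.IterateStart ();
while ( hQwords.IterateNext() )			// 2) caller fills per-word statistics
{
	ExtQword_t & tWord = hQwords.IterateGet ();
	tWord.m_fIDF = logf ( float(iTotalDocs) / float ( Max ( tWord.m_iDocs, 1 ) ) );	// hypothetical IDF
}

pRanker->SetQwordsIDF ( hQwords );		// 3) hand IDF (and max query position) back to the ranker tree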
The constructor used for the pRanker instance:
ExtRanker_T<STATE> ( const XQQuery_t & tXQ, const ISphQwordSetup & tSetup ) : ExtRanker_c ( tXQ, tSetup ) {}
From this we can see that all the real work is done by the ExtRanker_c constructor:
ExtRanker_c::ExtRanker_c ( const XQQuery_t & tXQ, const ISphQwordSetup & tSetup )
{
	assert ( tSetup.m_pCtx );

	m_iInlineRowitems = tSetup.m_iInlineRowitems;
	for ( int i=0; i<ExtNode_i::MAX_DOCS; i++ )
	{
		m_dMatches[i].Reset ( tSetup.m_iDynamicRowitems );
		m_dMyMatches[i].Reset ( tSetup.m_iDynamicRowitems );
	}
	m_tTestMatch.Reset ( tSetup.m_iDynamicRowitems );

	assert ( tXQ.m_pRoot );
	tSetup.m_pZoneChecker = this;	// register this ranker as the zone checker for the term nodes

	// build the evaluation tree from the parsed query tree
	m_pRoot = ExtNode_i::Create ( tXQ.m_pRoot, tSetup );
#if SPH_TREE_DUMP
	if ( m_pRoot )
		m_pRoot->DebugDump(0);
#endif

	m_pDoclist = NULL;
	m_pHitlist = NULL;
	m_uMaxID = 0;
	m_uPayloadMask = 0;
	m_iQwords = 0;
	m_pIndex = tSetup.m_pIndex;
	m_pCtx = tSetup.m_pCtx;

	// per-zone caches used later by IsInZone()
	m_dZones = tXQ.m_dZones;
	m_dZoneStart.Resize ( m_dZones.GetLength() );
	m_dZoneEnd.Resize ( m_dZones.GetLength() );
	m_dZoneMax.Resize ( m_dZones.GetLength() );
	m_dZoneMin.Resize ( m_dZones.GetLength() );
	m_dZoneMax.Fill ( 0 );
	m_dZoneMin.Fill ( DOCID_MAX );

	// for every zone in the query, create hidden start/end marker keywords
	// and wrap each in an ExtTerm_c reader
	ARRAY_FOREACH ( i, m_dZones )
	{
		XQKeyword_t tDot;

		tDot.m_sWord.SetSprintf ( "%c%s", MAGIC_CODE_ZONE, m_dZones[i].cstr() );
		m_dZoneStartTerm.Add ( new ExtTerm_c ( CreateQueryWord ( tDot, tSetup ), tSetup ) );
		m_dZoneStart[i] = NULL;

		tDot.m_sWord.SetSprintf ( "%c/%s", MAGIC_CODE_ZONE, m_dZones[i].cstr() );
		m_dZoneEndTerm.Add ( new ExtTerm_c ( CreateQueryWord ( tDot, tSetup ), tSetup ) );
		m_dZoneEnd[i] = NULL;
	}
}
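To make the zone loop at the end of the constructor concrete: every zone named in the query gets two hidden "magic" keywords, one matching the zone's opening markers and one (with a "/" prefix) matching its closing markers. A standalone sketch of just the naming scheme (the real value of MAGIC_CODE_ZONE lives in sphinx.h; the byte below is a placeholder, and "h1" is only an example zone):

#include <cstdio>

int main ()
{
	const char MAGIC_CODE_ZONE = '\x09';	// placeholder byte, not the real definition
	char sStart[64], sEnd[64];
	snprintf ( sStart, sizeof(sStart), "%c%s",  MAGIC_CODE_ZONE, "h1" );	// zone start marker keyword
	snprintf ( sEnd,   sizeof(sEnd),   "%c/%s", MAGIC_CODE_ZONE, "h1" );	// zone end marker keyword
	printf ( "start keyword: <magic>%s\nend keyword: <magic>%s\n", sStart+1, sEnd+1 );
	return 0;
}

m_dZoneStartTerm[i] and m_dZoneEndTerm[i] then read these exactly like ordinary terms, which is what IsInZone() below relies on to reconstruct the zone spans of each document.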
ExtRanker_c::~ExtRanker_c ()
{
	SafeDelete ( m_pRoot );
	ARRAY_FOREACH ( i, m_dZones )
	{
		SafeDelete ( m_dZoneStartTerm[i] );
		SafeDelete ( m_dZoneEndTerm[i] );
	}
}
void ExtRanker_c::Reset ( const ISphQwordSetup & tSetup )
{
	if ( m_pRoot )
		m_pRoot->Reset ( tSetup );
	ARRAY_FOREACH ( i, m_dZones )
	{
		m_dZoneStartTerm[i]->Reset ( tSetup );
		m_dZoneEndTerm[i]->Reset ( tSetup );

		m_dZoneStart[i] = NULL;
		m_dZoneEnd[i] = NULL;
	}

	m_dZoneMax.Fill ( 0 );
	m_dZoneMin.Fill ( DOCID_MAX );
	m_hZoneInfo.Reset();
}
const ExtDoc_t * ExtRanker_c::GetFilteredDocs ()
{
	for ( ;; )
	{
		// get another chunk
		m_uMaxID = 0;
		const ExtDoc_t * pCand = m_pRoot->GetDocsChunk ( &m_uMaxID );
		if ( !pCand )
			return NULL;

		// create matches, and filter them
		int iDocs = 0;
		while ( pCand->m_uDocid!=DOCID_MAX )
		{
			m_tTestMatch.m_iDocID = pCand->m_uDocid;
			// if attributes are stored inline, copy the inline row into the test match
			if ( pCand->m_pDocinfo )
				memcpy ( m_tTestMatch.m_pDynamic, pCand->m_pDocinfo, m_iInlineRowitems*sizeof(CSphRowitem) );

			if ( m_pIndex->EarlyReject ( m_pCtx, m_tTestMatch ) )
			{
				pCand++;
				continue;
			}

			m_dMyDocs[iDocs] = *pCand;
			m_tTestMatch.m_iWeight = (int)( (pCand->m_fTFIDF+0.5f)*SPH_BM25_SCALE ); // FIXME! bench bNeedBM25
			Swap ( m_tTestMatch, m_dMyMatches[iDocs] );
			iDocs++;
			pCand++;
		}

		// clean up zone hash
		if ( m_uMaxID!=DOCID_MAX )
		{
			ARRAY_FOREACH ( i, m_dZoneMin )
			{
				SphDocID_t uMinDocid = m_dZoneMin[i];
				if ( uMinDocid==DOCID_MAX )
					continue;

				ZoneKey_t tZoneStart;
				tZoneStart.m_iZone = i;
				tZoneStart.m_uDocid = uMinDocid;
				Verify ( m_hZoneInfo.IterateStart ( tZoneStart ) );
				uMinDocid = DOCID_MAX;

				do
				{
					ZoneKey_t tKey = m_hZoneInfo.IterateGetKey();
					if ( tKey.m_iZone!=i || tKey.m_uDocid>m_uMaxID )
					{
						uMinDocid = ( tKey.m_iZone==i ) ? tKey.m_uDocid : DOCID_MAX;
						break;
					}

					m_hZoneInfo.Delete ( tKey );
				} while ( m_hZoneInfo.IterateNext() );

				m_dZoneMin[i] = uMinDocid;
			}
		}

		if ( iDocs )
		{
			m_dMyDocs[iDocs].m_uDocid = DOCID_MAX;
			return m_dMyDocs;
		}
	}
}
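For orientation, this is roughly how a concrete ranker's GetMatches() consumes these chunks (a simplified sketch; ProcessMatch() is a hypothetical stand-in for the real per-document scoring):

// simplified caller-side sketch; ProcessMatch() is hypothetical, the real
// consumers are the GetMatches() implementations of the concrete rankers
const ExtDoc_t * pDocs = GetFilteredDocs ();
while ( pDocs )
{
	for ( int i=0; pDocs[i].m_uDocid!=DOCID_MAX; i++ )
	{
		// m_dMyMatches[i] already holds the docid, the inline attributes,
		// and the BM25-scaled base weight prepared above
		ProcessMatch ( m_dMyMatches[i] );
	}
	pDocs = GetFilteredDocs ();		// next chunk; NULL once the doclist is exhausted
}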
void ExtRanker_c::SetQwordsIDF ( const ExtQwordsHash_t & hQwords )
{
	m_iQwords = hQwords.GetLength ();

	m_iMaxQuerypos = 0;
	hQwords.IterateStart();
	while ( hQwords.IterateNext() )
		m_iMaxQuerypos = Max ( m_iMaxQuerypos, hQwords.IterateGet().m_iQueryPos );

	if ( m_pRoot )
		m_pRoot->SetQwordsIDF ( hQwords );
}
SphZoneHit_e ExtRanker_c::IsInZone ( int iZone, const ExtHit_t * pHit )
{
	// quick route, we have current docid cached
	ZoneKey_t tKey; // OPTIMIZE? allow 2-component hash keys maybe?
	tKey.m_uDocid = pHit->m_uDocid;
	tKey.m_iZone = iZone;

	ZoneInfo_t * pZone = m_hZoneInfo ( tKey );
	if ( pZone )
	{
		// remove end markers that might mess up ordering
		Hitpos_t uPos = HITMAN::GetLCS ( pHit->m_uHitpos );
		int iSpan = FindSpan ( pZone->m_dStarts, uPos );
		return ( iSpan>=0 && uPos<=pZone->m_dEnds[iSpan] ) ? SPH_ZONE_FOUND : SPH_ZONE_NO_SPAN;
	}

	// is there any zone info for this document at all?
	if ( pHit->m_uDocid<=m_dZoneMax[iZone] )
		return SPH_ZONE_NO_DOCUMENT;

	// long route, read in zone info for all (!) the documents until next requested
	// that's because we might be queried out of order

	// current chunk
	const ExtDoc_t * pStart = m_dZoneStart[iZone];
	const ExtDoc_t * pEnd = m_dZoneEnd[iZone];

	// now keep caching spans until we see current id
	while ( pHit->m_uDocid > m_dZoneMax[iZone] )
	{
		// get more docs if needed
		if ( ( !pStart && m_dZoneMax[iZone]!=DOCID_MAX ) || pStart->m_uDocid==DOCID_MAX )
		{
			pStart = m_dZoneStartTerm[iZone]->GetDocsChunk ( NULL );
			if ( !pStart )
			{
				m_dZoneMax[iZone] = DOCID_MAX;
				return SPH_ZONE_NO_DOCUMENT;
			}
		}

		if ( ( !pEnd && m_dZoneMax[iZone]!=DOCID_MAX ) || pEnd->m_uDocid==DOCID_MAX )
		{
			pEnd = m_dZoneEndTerm[iZone]->GetDocsChunk ( NULL );
			if ( !pEnd )
			{
				m_dZoneMax[iZone] = DOCID_MAX;
				return SPH_ZONE_NO_DOCUMENT;
			}
		}

		assert ( pStart && pEnd );

		// skip zone starts past already cached stuff
		while ( pStart->m_uDocid<=m_dZoneMax[iZone] )
			pStart++;
		if ( pStart->m_uDocid==DOCID_MAX )
			continue;

		// skip zone ends until a match with start
		while ( pEnd->m_uDocid<pStart->m_uDocid )
			pEnd++;
		if ( pEnd->m_uDocid==DOCID_MAX )
			continue;

		// handle mismatching start/end ids
		// (this must never happen normally, but who knows what data we're fed)
		assert ( pStart->m_uDocid!=DOCID_MAX );
		assert ( pEnd->m_uDocid!=DOCID_MAX );
		assert ( pStart->m_uDocid<=pEnd->m_uDocid );
		if ( pStart->m_uDocid!=pEnd->m_uDocid )
		{
			while ( pStart->m_uDocid < pEnd->m_uDocid )
				pStart++;
			if ( pStart->m_uDocid==DOCID_MAX )
				continue;
		}

		// first matching uncached docid found!
		assert ( pStart->m_uDocid==pEnd->m_uDocid );
		assert ( pStart->m_uDocid > m_dZoneMax[iZone] );

		// but maybe we don't need docid this big just yet?
		if ( pStart->m_uDocid > pHit->m_uDocid )
		{
			// store current in-chunk positions
			m_dZoneStart[iZone] = pStart;
			m_dZoneEnd[iZone] = pEnd;

			// no zone info for all those preceding documents (including requested one)
			m_dZoneMax[iZone] = pStart->m_uDocid-1;
			return SPH_ZONE_NO_DOCUMENT;
		}

		// cache all matching docs from current chunks below requested docid
		// (there might be more matching docs, but we are lazy and won't cache them upfront)
		ExtDoc_t dCache [ ExtNode_i::MAX_DOCS ];
		int iCache = 0;

		while ( pStart->m_uDocid<=pHit->m_uDocid )
		{
			// match
			if ( pStart->m_uDocid==pEnd->m_uDocid )
			{
				dCache[iCache++] = *pStart;
				pStart++;
				pEnd++;
				continue;
			}

			// mismatch!
			// this must not really happen, starts/ends must be in sync
			// but let's be graceful anyway, and just skip to next match
			if ( pStart->m_uDocid==DOCID_MAX || pEnd->m_uDocid==DOCID_MAX )
				break;

			while ( pStart->m_uDocid < pEnd->m_uDocid )
				pStart++;
			if ( pStart->m_uDocid==DOCID_MAX )
				break;

			while ( pEnd->m_uDocid < pStart->m_uDocid )
				pEnd++;
			if ( pEnd->m_uDocid==DOCID_MAX )
				break;
		}

		// should have found at least one id to cache
		assert ( iCache );
		assert ( iCache < ExtNode_i::MAX_DOCS );
		dCache[iCache].m_uDocid = DOCID_MAX;

		// do caching
		const ExtHit_t * pStartHits = m_dZoneStartTerm[iZone]->GetHitsChunk ( dCache, DOCID_MAX );
		const ExtHit_t * pEndHits = m_dZoneEndTerm[iZone]->GetHitsChunk ( dCache, DOCID_MAX );

		// loop documents one by one
		while ( pStartHits && pEndHits )
		{
			// load all hits for current document
			SphDocID_t uCur = pStartHits->m_uDocid;
			tKey.m_uDocid = uCur;
			m_hZoneInfo.Add ( ZoneInfo_t(), tKey );
			pZone = m_hZoneInfo ( tKey ); // OPTIMIZE? return pointer from Add()?

			// load all the start hits for it
			while ( pStartHits )
			{
				while ( pStartHits->m_uDocid==uCur )
				{
					pZone->m_dStarts.Add ( pStartHits->m_uHitpos );
					pStartHits++;
				}
				if ( pStartHits->m_uDocid!=DOCID_MAX )
					break;
				pStartHits = m_dZoneStartTerm[iZone]->GetHitsChunk ( dCache, DOCID_MAX );
			}

			// load all the end hits for it
			assert ( pEndHits->m_uDocid==uCur );
			while ( pEndHits )
			{
				while ( pEndHits->m_uDocid==uCur )
				{
					pZone->m_dEnds.Add ( pEndHits->m_uHitpos );
					pEndHits++;
				}
				if ( pEndHits->m_uDocid!=DOCID_MAX )
					break;
				pEndHits = m_dZoneEndTerm[iZone]->GetHitsChunk ( dCache, DOCID_MAX );
			}

			// data sanity checks
			assert ( pZone->m_dStarts.GetLength()==pZone->m_dEnds.GetLength() );

			// update cache status
			m_dZoneMax[iZone] = uCur;
			m_dZoneMin[iZone] = Min ( m_dZoneMin[iZone], uCur );
		}
	}

	// store current in-chunk positions
	m_dZoneStart[iZone] = pStart;
	m_dZoneEnd[iZone] = pEnd;

	// cached a bunch of spans, try our check again
	tKey.m_uDocid = pHit->m_uDocid;
	pZone = m_hZoneInfo ( tKey );
	if ( pZone )
	{
		// remove end markers that might mess up ordering
		Hitpos_t uPos = HITMAN::GetLCS ( pHit->m_uHitpos );
		int iSpan = FindSpan ( pZone->m_dStarts, uPos );
		return ( iSpan>=0 && uPos<=pZone->m_dEnds[iSpan] ) ? SPH_ZONE_FOUND : SPH_ZONE_NO_SPAN;
	}
	return SPH_ZONE_NO_DOCUMENT;
}
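The containment test at the top and bottom of IsInZone() boils down to: find the last zone start at or before the hit position, then check that its paired end has not been passed. A self-contained sketch of that check (HitInsideZone() is my own helper, not Sphinx's FindSpan(); it assumes the starts are sorted ascending and that starts/ends are paired index by index, as they are in ZoneInfo_t):

#include <vector>
#include <algorithm>
#include <cstdint>

// standalone equivalent of the FindSpan()+ends check in IsInZone();
// dStarts/dEnds are one document's zone spans, uPos is the hit position
// with end markers already stripped (HITMAN::GetLCS above)
static bool HitInsideZone ( const std::vector<uint32_t> & dStarts,
	const std::vector<uint32_t> & dEnds, uint32_t uPos )
{
	auto it = std::upper_bound ( dStarts.begin(), dStarts.end(), uPos );
	if ( it==dStarts.begin() )
		return false;									// hit comes before every zone opening
	size_t iSpan = size_t ( it-dStarts.begin() ) - 1;	// index of the last start <= uPos
	return uPos<=dEnds[iSpan];							// inside iff that span's end is not passed yet
}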