ExtRanker_T < RankerState_ProximityBM25Exact_fn > ( tXQ, tTermSetup )

Questions to answer:

Why does ExtRanker_T end up holding the keyword information?

And what does its constructor do?


pRanker = new ExtRanker_T < RankerState_ProximityBM25Exact_fn > ( tXQ, tTermSetup );

tXQ is in fact the parsed keyword tree of the query, and it carries everything needed to search those keywords. So by the time ExtRanker_T is constructed, the scoring/ranking class already holds the keyword information.
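
To make the relationship concrete, here is a minimal standalone sketch of the same pattern — not Sphinx's real classes; QueryTree, RankerBase, Ranker_T and State_WordCount are made-up names — in which a generic base class stores the parsed query tree while a template parameter supplies the scoring policy, just as RankerState_ProximityBM25Exact_fn does for ExtRanker_T:

#include <cstdio>
#include <vector>

// stand-ins for XQQuery_t / ExtRanker_c / ExtRanker_T (names are invented)
struct QueryNode { const char * m_sWord; };             // one keyword node
struct QueryTree { std::vector<QueryNode> m_dWords; };  // "tXQ": the parsed keyword tree

class RankerBase
{
public:
	explicit RankerBase ( const QueryTree & tTree ) : m_pTree ( &tTree ) {} // keep the tree, like ExtRanker_c keeps tXQ
protected:
	const QueryTree * m_pTree;
};

// the scoring policy plays the role of RankerState_ProximityBM25Exact_fn
struct State_WordCount
{
	int Score ( const QueryTree & tTree ) const { return (int)tTree.m_dWords.size(); }
};

template < typename STATE >
class Ranker_T : public RankerBase
{
public:
	explicit Ranker_T ( const QueryTree & tTree ) : RankerBase ( tTree ) {}
	int Rank () const { return m_tState.Score ( *m_pTree ); } // base owns the tree, STATE scores it
private:
	STATE m_tState;
};

int main ()
{
	QueryTree tXQ;
	tXQ.m_dWords.push_back ( { "hello" } );
	tXQ.m_dWords.push_back ( { "world" } );

	Ranker_T<State_WordCount> tRanker ( tXQ ); // the ranker receives the keywords at construction time
	printf ( "score=%d\n", tRanker.Rank() );
	return 0;
}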

ExtQwordsHash_t hQwords;
pRanker->GetQwords ( hQwords );

The per-keyword information is then collected into hQwords. What is that for? Judging from SetQwordsIDF further down, the caller uses these collected statistics to compute per-keyword IDF values and hands them back to the ranker, so gathering everything into one hash makes that round trip straightforward.
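
Here is a standalone sketch of that round trip, with std::map standing in for ExtQwordsHash_t; the field names iDocs/fIDF and the IDF formula are assumptions for illustration only:

#include <cmath>
#include <cstdio>
#include <map>
#include <string>

// stand-in for ExtQword_t: per-keyword stats collected by GetQwords() (field names assumed)
struct Qword { int iDocs = 0; float fIDF = 0.0f; };

int main ()
{
	const int iTotalDocs = 1000;          // collection size, assumed known to the caller
	std::map<std::string, Qword> hQwords; // stand-in for ExtQwordsHash_t
	hQwords["hello"].iDocs = 120;         // as if filled by pRanker->GetQwords ( hQwords )
	hQwords["world"].iDocs = 10;

	// compute a BM25-style IDF per keyword (the exact Sphinx formula may differ)
	for ( auto & tEntry : hQwords )
	{
		Qword & tWord = tEntry.second;
		tWord.fIDF = logf ( ( iTotalDocs - tWord.iDocs + 0.5f ) / ( tWord.iDocs + 0.5f ) );
		printf ( "%s: docs=%d idf=%.3f\n", tEntry.first.c_str(), tWord.iDocs, tWord.fIDF );
	}

	// in Sphinx the results would now be pushed back via pRanker->SetQwordsIDF ( hQwords )
	return 0;
}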

For this pRanker instance, the constructor is:

ExtRanker_T<STATE> ( const XQQuery_t & tXQ, const ISphQwordSetup & tSetup ) : ExtRanker_c ( tXQ, tSetup ) {}

As you can see, this simply delegates to the ExtRanker_c base-class constructor:

ExtRanker_c::ExtRanker_c ( const XQQuery_t & tXQ, const ISphQwordSetup & tSetup )
{
	assert ( tSetup.m_pCtx );

	m_iInlineRowitems = tSetup.m_iInlineRowitems;
	for ( int i=0; i<ExtNode_i::MAX_DOCS; i++ )
	{
		m_dMatches[i].Reset ( tSetup.m_iDynamicRowitems );
		m_dMyMatches[i].Reset ( tSetup.m_iDynamicRowitems );
	}
	m_tTestMatch.Reset ( tSetup.m_iDynamicRowitems );

	assert ( tXQ.m_pRoot );
	tSetup.m_pZoneChecker = this;
	m_pRoot = ExtNode_i::Create ( tXQ.m_pRoot, tSetup );
#if SPH_TREE_DUMP
	if ( m_pRoot )
		m_pRoot->DebugDump(0);
#endif

	m_pDoclist = NULL;
	m_pHitlist = NULL;
	m_uMaxID = 0;
	m_uPayloadMask = 0;
	m_iQwords = 0;
	m_pIndex = tSetup.m_pIndex;
	m_pCtx = tSetup.m_pCtx;

	m_dZones = tXQ.m_dZones;
	m_dZoneStart.Resize ( m_dZones.GetLength() );
	m_dZoneEnd.Resize ( m_dZones.GetLength() );
	m_dZoneMax.Resize ( m_dZones.GetLength() );
	m_dZoneMin.Resize ( m_dZones.GetLength() );
	m_dZoneMax.Fill ( 0 );
	m_dZoneMin.Fill	( DOCID_MAX );

	ARRAY_FOREACH ( i, m_dZones )
	{
		XQKeyword_t tDot;

		tDot.m_sWord.SetSprintf ( "%c%s", MAGIC_CODE_ZONE, m_dZones[i].cstr() );
		m_dZoneStartTerm.Add ( new ExtTerm_c ( CreateQueryWord ( tDot, tSetup ), tSetup ) );
		m_dZoneStart[i] = NULL;

		tDot.m_sWord.SetSprintf ( "%c/%s", MAGIC_CODE_ZONE, m_dZones[i].cstr() );
		m_dZoneEndTerm.Add ( new ExtTerm_c ( CreateQueryWord ( tDot, tSetup ), tSetup ) );
		m_dZoneEnd[i] = NULL;
	}
}
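
In short, the constructor sizes the per-match dynamic attribute rows, registers itself as the zone checker, builds the evaluation tree from tXQ.m_pRoot via ExtNode_i::Create, and, for every zone listed in tXQ.m_dZones, creates a pair of hidden "magic" terms that match where the zone opens ("%c%s") and closes ("%c/%s") inside a document. A minimal standalone sketch of that marker-naming scheme follows; the concrete MAGIC_CODE_ZONE value used here is an assumption:

#include <cstdio>

int main ()
{
	const char MAGIC_CODE_ZONE = '\x01'; // assumption: some reserved non-printable byte
	const char * sZone = "title";

	char sStart[64], sEnd[64];
	snprintf ( sStart, sizeof(sStart), "%c%s",  MAGIC_CODE_ZONE, sZone ); // zone-open marker keyword
	snprintf ( sEnd,   sizeof(sEnd),   "%c/%s", MAGIC_CODE_ZONE, sZone ); // zone-close marker keyword

	// the ranker later searches these two markers like ordinary keywords
	// to learn where each zone starts and ends inside every document
	printf ( "open marker:  [magic]%s\n", sStart+1 );
	printf ( "close marker: [magic]/%s\n", sEnd+2 );
	return 0;
}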


ExtRanker_c::~ExtRanker_c ()
{
	SafeDelete ( m_pRoot );
	ARRAY_FOREACH ( i, m_dZones )
	{
		SafeDelete ( m_dZoneStartTerm[i] );
		SafeDelete ( m_dZoneEndTerm[i] );
	}
}

void ExtRanker_c::Reset ( const ISphQwordSetup & tSetup )
{
	if ( m_pRoot )
		m_pRoot->Reset ( tSetup );
	ARRAY_FOREACH ( i, m_dZones )
	{
		m_dZoneStartTerm[i]->Reset ( tSetup );
		m_dZoneEndTerm[i]->Reset ( tSetup );

		m_dZoneStart[i] = NULL;
		m_dZoneEnd[i] = NULL;
	}

	m_dZoneMax.Fill ( 0 );
	m_dZoneMin.Fill ( DOCID_MAX );
	m_hZoneInfo.Reset();
}


const ExtDoc_t * ExtRanker_c::GetFilteredDocs ()
{
	for ( ;; )
	{
		// get another chunk
		m_uMaxID = 0;
		const ExtDoc_t * pCand = m_pRoot->GetDocsChunk ( &m_uMaxID );
		if ( !pCand )
			return NULL;

		// create matches, and filter them
		int iDocs = 0;
		while ( pCand->m_uDocid!=DOCID_MAX )
		{
			m_tTestMatch.m_iDocID = pCand->m_uDocid;
			// if attributes are stored inline, copy them into the match's dynamic row
			if ( pCand->m_pDocinfo )
				memcpy ( m_tTestMatch.m_pDynamic, pCand->m_pDocinfo, m_iInlineRowitems*sizeof(CSphRowitem) );

			if ( m_pIndex->EarlyReject ( m_pCtx, m_tTestMatch ) )
			{
				pCand++;
				continue;
			}

			m_dMyDocs[iDocs] = *pCand;
			m_tTestMatch.m_iWeight = (int)( (pCand->m_fTFIDF+0.5f)*SPH_BM25_SCALE ); // FIXME! bench bNeedBM25
			Swap ( m_tTestMatch, m_dMyMatches[iDocs] );
			iDocs++;
			pCand++;
		}

		// clean up zone hash
		if ( m_uMaxID!=DOCID_MAX )
		{
			ARRAY_FOREACH ( i, m_dZoneMin )
			{
				SphDocID_t uMinDocid = m_dZoneMin[i];
				if ( uMinDocid==DOCID_MAX )
					continue;

				ZoneKey_t tZoneStart;
				tZoneStart.m_iZone = i;
				tZoneStart.m_uDocid = uMinDocid;
				Verify ( m_hZoneInfo.IterateStart ( tZoneStart ) );
				uMinDocid = DOCID_MAX;
				do
				{
					ZoneKey_t tKey = m_hZoneInfo.IterateGetKey();
					if ( tKey.m_iZone!=i || tKey.m_uDocid>m_uMaxID )
					{
						uMinDocid = ( tKey.m_iZone==i ) ? tKey.m_uDocid : DOCID_MAX;
						break;
					}

					m_hZoneInfo.Delete ( tKey );
				} while ( m_hZoneInfo.IterateNext() );

				m_dZoneMin[i] = uMinDocid;
			}
		}

		if ( iDocs )
		{
			m_dMyDocs[iDocs].m_uDocid = DOCID_MAX;
			return m_dMyDocs;
		}
	}
}
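
GetFilteredDocs keeps asking the root node for chunks of candidate documents, copies inline attributes into a test match, drops everything that EarlyReject filters out, precomputes a coarse weight from m_fTFIDF, prunes stale entries from the zone cache, and returns the survivors as an array terminated by a DOCID_MAX sentinel; if an entire chunk is rejected it simply loops and pulls the next one. A minimal standalone sketch of that chunk / filter / sentinel pattern follows; Source, Doc and the EarlyReject predicate are made up for illustration:

#include <cstdio>
#include <vector>

const unsigned DOCID_MAX = ~0u; // sentinel terminating every chunk, as in Sphinx

struct Doc { unsigned uDocid; };

// pretend document source: hands out fixed-size chunks, each terminated by DOCID_MAX
struct Source
{
	unsigned m_uNext = 1;
	std::vector<Doc> m_dChunk;

	const Doc * GetDocsChunk ()
	{
		if ( m_uNext>20 )
			return nullptr; // no more chunks
		m_dChunk.clear();
		for ( int i=0; i<5; i++ )
			m_dChunk.push_back ( { m_uNext++ } );
		m_dChunk.push_back ( { DOCID_MAX } );
		return m_dChunk.data();
	}
};

static bool EarlyReject ( const Doc & tDoc ) { return ( tDoc.uDocid % 2 )==0; } // made-up filter: reject even ids

int main ()
{
	Source tSource;
	std::vector<Doc> dMyDocs;

	for ( ;; )
	{
		const Doc * pCand = tSource.GetDocsChunk();
		if ( !pCand )
			break; // source exhausted

		dMyDocs.clear();
		while ( pCand->uDocid!=DOCID_MAX ) // walk the chunk up to the sentinel
		{
			if ( !EarlyReject ( *pCand ) )
				dMyDocs.push_back ( *pCand ); // keep the survivors
			pCand++;
		}
		if ( dMyDocs.empty() )
			continue; // whole chunk filtered out, try the next one

		for ( const Doc & tDoc : dMyDocs )
			printf ( "%u ", tDoc.uDocid );
		printf ( "\n" );
	}
	return 0;
}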


void ExtRanker_c::SetQwordsIDF ( const ExtQwordsHash_t & hQwords )
{
	m_iQwords = hQwords.GetLength ();

	m_iMaxQuerypos = 0;
	hQwords.IterateStart();
	while ( hQwords.IterateNext() )
		m_iMaxQuerypos = Max ( m_iMaxQuerypos, hQwords.IterateGet().m_iQueryPos );

	if ( m_pRoot )
		m_pRoot->SetQwordsIDF ( hQwords );
}


SphZoneHit_e ExtRanker_c::IsInZone ( int iZone, const ExtHit_t * pHit )
{
	// quick route, we have current docid cached
	ZoneKey_t tKey; // OPTIMIZE? allow 2-component hash keys maybe?
	tKey.m_uDocid = pHit->m_uDocid;
	tKey.m_iZone = iZone;

	ZoneInfo_t * pZone = m_hZoneInfo ( tKey );
	if ( pZone )
	{
		// remove end markers that might mess up ordering
		Hitpos_t uPos = HITMAN::GetLCS ( pHit->m_uHitpos );
		int iSpan = FindSpan ( pZone->m_dStarts, uPos );
		return ( iSpan>=0 && uPos<=pZone->m_dEnds[iSpan] ) ? SPH_ZONE_FOUND : SPH_ZONE_NO_SPAN;
	}

	// is there any zone info for this document at all?
	if ( pHit->m_uDocid<=m_dZoneMax[iZone] )
		return SPH_ZONE_NO_DOCUMENT;

	// long route, read in zone info for all (!) the documents until next requested
	// that's because we might be queried out of order

	// current chunk
	const ExtDoc_t * pStart = m_dZoneStart[iZone];
	const ExtDoc_t * pEnd = m_dZoneEnd[iZone];

	// now keep caching spans until we see current id
	while ( pHit->m_uDocid > m_dZoneMax[iZone] )
	{
		// get more docs if needed
		if ( ( !pStart && m_dZoneMax[iZone]!=DOCID_MAX ) || pStart->m_uDocid==DOCID_MAX )
		{
			pStart = m_dZoneStartTerm[iZone]->GetDocsChunk ( NULL );
			if ( !pStart )
			{
				m_dZoneMax[iZone] = DOCID_MAX;
				return SPH_ZONE_NO_DOCUMENT;
			}
		}

		if ( ( !pEnd && m_dZoneMax[iZone]!=DOCID_MAX ) || pEnd->m_uDocid==DOCID_MAX )
		{
			pEnd = m_dZoneEndTerm[iZone]->GetDocsChunk ( NULL );
			if ( !pEnd )
			{
				m_dZoneMax[iZone] = DOCID_MAX;
				return SPH_ZONE_NO_DOCUMENT;
			}
		}

		assert ( pStart && pEnd );

		// skip zone starts past already cached stuff
		while ( pStart->m_uDocid<=m_dZoneMax[iZone] )
			pStart++;
		if ( pStart->m_uDocid==DOCID_MAX )
			continue;

		// skip zone ends until a match with start
		while ( pEnd->m_uDocid<pStart->m_uDocid )
			pEnd++;
		if ( pEnd->m_uDocid==DOCID_MAX )
			continue;

		// handle mismatching start/end ids
		// (this must never happen normally, but who knows what data we're fed)
		assert ( pStart->m_uDocid!=DOCID_MAX );
		assert ( pEnd->m_uDocid!=DOCID_MAX );
		assert ( pStart->m_uDocid<=pEnd->m_uDocid );

		if ( pStart->m_uDocid!=pEnd->m_uDocid )
		{
			while ( pStart->m_uDocid < pEnd->m_uDocid )
				pStart++;
			if ( pStart->m_uDocid==DOCID_MAX )
				continue;
		}

		// first matching uncached docid found!
		assert ( pStart->m_uDocid==pEnd->m_uDocid );
		assert ( pStart->m_uDocid > m_dZoneMax[iZone] );

		// but maybe we don't need docid this big just yet?
		if ( pStart->m_uDocid > pHit->m_uDocid )
		{
			// store current in-chunk positions
			m_dZoneStart[iZone] = pStart;
			m_dZoneEnd[iZone] = pEnd;

			// no zone info for all those preceding documents (including the requested one)
			m_dZoneMax[iZone] = pStart->m_uDocid-1;
			return SPH_ZONE_NO_DOCUMENT;
		}

		// cache all matching docs from current chunks below requested docid
		// (there might be more matching docs, but we are lazy and won't cache them upfront)
		ExtDoc_t dCache [ ExtNode_i::MAX_DOCS ];
		int iCache = 0;

		while ( pStart->m_uDocid<=pHit->m_uDocid )
		{
			// match
			if ( pStart->m_uDocid==pEnd->m_uDocid )
			{
				dCache[iCache++] = *pStart;
				pStart++;
				pEnd++;
				continue;
			}

			// mismatch!
			// this must not really happen, starts/ends must be in sync
			// but let's be graceful anyway, and just skip to next match
			if ( pStart->m_uDocid==DOCID_MAX || pEnd->m_uDocid==DOCID_MAX )
				break;

			while ( pStart->m_uDocid < pEnd->m_uDocid )
				pStart++;
			if ( pStart->m_uDocid==DOCID_MAX )
				break;

			while ( pEnd->m_uDocid < pStart->m_uDocid )
				pEnd++;
			if ( pEnd->m_uDocid==DOCID_MAX )
				break;
		}

		// should have found at least one id to cache
		assert ( iCache );
		assert ( iCache < ExtNode_i::MAX_DOCS );
		dCache[iCache].m_uDocid = DOCID_MAX;

		// do caching
		const ExtHit_t * pStartHits = m_dZoneStartTerm[iZone]->GetHitsChunk ( dCache, DOCID_MAX );
		const ExtHit_t * pEndHits = m_dZoneEndTerm[iZone]->GetHitsChunk ( dCache, DOCID_MAX );

		// loop documents one by one
		while ( pStartHits && pEndHits )
		{
			// load all hits for current document
			SphDocID_t uCur = pStartHits->m_uDocid;

			tKey.m_uDocid = uCur;
			m_hZoneInfo.Add ( ZoneInfo_t(), tKey );
			pZone = m_hZoneInfo ( tKey ); // OPTIMIZE? return pointer from Add()?

			// load all the start hits for it
			while ( pStartHits )
			{
				while ( pStartHits->m_uDocid==uCur )
				{
					pZone->m_dStarts.Add ( pStartHits->m_uHitpos );
					pStartHits++;
				}
				if ( pStartHits->m_uDocid!=DOCID_MAX )
					break;
				pStartHits = m_dZoneStartTerm[iZone]->GetHitsChunk ( dCache, DOCID_MAX );
			}

			// load all the end hits for it
			assert ( pEndHits->m_uDocid==uCur );
			while ( pEndHits )
			{
				while ( pEndHits->m_uDocid==uCur )
				{
					pZone->m_dEnds.Add ( pEndHits->m_uHitpos );
					pEndHits++;
				}
				if ( pEndHits->m_uDocid!=DOCID_MAX )
					break;
				pEndHits = m_dZoneEndTerm[iZone]->GetHitsChunk ( dCache, DOCID_MAX );
			}

			// data sanity checks
			assert ( pZone->m_dStarts.GetLength()==pZone->m_dEnds.GetLength() );

			// update cache status
			m_dZoneMax[iZone] = uCur;
			m_dZoneMin[iZone] = Min ( m_dZoneMin[iZone], uCur );
		}
	}

	// store current in-chunk positions
	m_dZoneStart[iZone] = pStart;
	m_dZoneEnd[iZone] = pEnd;

	// cached a bunch of spans, try our check again
	tKey.m_uDocid = pHit->m_uDocid;
	pZone = m_hZoneInfo ( tKey );
	if ( pZone )
	{
		// remove end markers that might mess up ordering
		Hitpos_t uPos = HITMAN::GetLCS ( pHit->m_uHitpos );
		int iSpan = FindSpan ( pZone->m_dStarts, uPos );
		return ( iSpan>=0 && uPos<=pZone->m_dEnds[iSpan] ) ? SPH_ZONE_FOUND : SPH_ZONE_NO_SPAN;
	}

	return SPH_ZONE_NO_DOCUMENT;
}
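
Both the quick path at the top and the final recheck boil down to the same test: FindSpan locates, in the document's cached list of span starts, the last start at or before the hit position, and the hit is inside the zone if the matching end position is not smaller. A standalone sketch of that containment check, with FindSpan approximated by std::upper_bound (the real Sphinx helper may differ in details):

#include <algorithm>
#include <cstdio>
#include <initializer_list>
#include <vector>

// return the index of the last span start <= uPos, or -1 if uPos precedes all spans
static int FindSpanIdx ( const std::vector<unsigned> & dStarts, unsigned uPos )
{
	auto it = std::upper_bound ( dStarts.begin(), dStarts.end(), uPos );
	return (int)( it - dStarts.begin() ) - 1;
}

int main ()
{
	// cached zone spans for one document: [10,20], [35,40], [60,90]
	std::vector<unsigned> dStarts = { 10, 35, 60 };
	std::vector<unsigned> dEnds   = { 20, 40, 90 };

	for ( unsigned uPos : { 5u, 15u, 25u, 70u } )
	{
		int iSpan = FindSpanIdx ( dStarts, uPos );
		bool bInZone = ( iSpan>=0 && uPos<=dEnds[iSpan] ); // same test as in IsInZone
		printf ( "pos=%u -> %s\n", uPos, bInZone ? "SPH_ZONE_FOUND" : "SPH_ZONE_NO_SPAN" );
	}
	return 0;
}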

