广告系统中weak-and算法原理及编码验证

最新推荐文章于 2024-07-14 08:52:33 发布

weixin_30496431

最新推荐文章于 2024-07-14 08:52:33 发布

阅读量199

点赞数

文章标签： python

原文链接：http://www.cnblogs.com/besty/p/3917785.html

版权

wand(weak and)算法基本思路

　　一般搜索的query比较短，但如果query比较长，如是一段文本，需要搜索相似的文本，这时候一般就需要wand算法，该算法在广告系统中有比较成熟的应

该，主要是adsense场景，需要搜索一个页面内容的相似广告。

　　Wand方法简单来说，一般我们在计算文本相关性的时候，会通过倒排索引的方式进行查询，通过倒排索引已经要比全量遍历节约大量时间，但是有时候仍

然很慢。

　　原因是很多时候我们其实只是想要top n个结果，一些结果明显较差的也进行了复杂的相关性计算，而weak-and算法通过计算每个词的贡献上限来估计文档

的相关性上限，从而建立一个阈值对倒排中的结果进行减枝，从而得到提速的效果。

　　 wand算法首先要估计每个词对相关性贡献的上限，最简单的相关性就是TF*IDF，一般query中词的TF均为1，IDF是固定的，因此就是估计一个词在文档中的

词频TF上限，一般TF需要归一化，即除以文档所有词的个数，因此，就是要估算一个词在文档中所能占到的最大比例，这个线下计算即可。

　　知道了一个词的相关性上界值，就可以知道一个query和一个文档的相关性上限值，显然就是他们共同的词的相关性上限值的和。

　　这样对于一个query，获得其所有词的相关性贡献上限，然后对一个文档，看其和query中都出现的词，然后求这些词的贡献和即可，然后和一个预设值比

较，如果超过预设值，则进入下一步的计算，否则则丢弃。

　　如果按照这样的方法计算n个最相似文档，就要取出所有的文档，每个文档作预计算，比较threshold，然后决定是否在top-n之列。这样计算当然可行，但

是还是可以优化的。

wand(weak and)算法原理演示

代码实现了主要的算法逻辑以验证算法的有效性，供大家参考，该实现优化了原始算法的一些逻辑尽量减少了无谓的循环：

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

 
       www.169it.com 
      
       #!/usr/bin/python 
      
       #wangben updated 20130108 
      
       class 
       WAND: 
      
       '''implement wand algorithm''' 
      
       def 
       __init__( 
       self 
       , InvertIndex, last_docid): 
      
       self 
       .invert_index  
       = 
       InvertIndex  
       #InvertIndex: term -> docid1, docid2, docid3 ... 
      
       self 
       .current_doc  
       = 
       0 
      
       self 
       .current_invert_index  
       = 
       {} 
      
       self 
       .query_terms  
       = 
       [] 
      
       self 
       .threshold  
       = 
       2 
      
       self 
       .sort_terms  
       = 
       [] 
      
       self 
       .LastID  
       = 
       2000000000 
       #big num 
      
       self 
       .debug_count  
       = 
       0 
      
       self 
       .last_docid  
       = 
       last_docid 
      
       def 
       __InitQuery( 
       self 
       , query_terms): 
      
       '''check terms len > 0''' 
      
       self 
       .current_doc  
       = 
       - 
       1 
      
       self 
       .current_invert_index.clear() 
      
       self 
       .query_terms  
       = 
       query_terms 
      
       self 
       .sort_terms[:]  
       = 
       [] 
      
       self 
       .debug_count  
       = 
       0 
      
       for 
       term  
       in 
       query_terms: 
      
       #initial start pos from the first position of term's invert_index 
      
       self 
       .current_invert_index[term]  
       = 
       [  
       self 
       .invert_index[term][ 
       0 
       ],  
       0 
       ]  
       #[ docid, index ] 
      
       def 
       __SortTerms( 
       self 
       ): 
      
       if 
       len 
       ( 
       self 
       .sort_terms)  
       = 
       = 
       0 
       : 
      
       for 
       term  
       in 
       self 
       .query_terms: 
      
       if 
       term  
       in 
       self 
       .current_invert_index: 
      
       doc_id  
       = 
       self 
       .current_invert_index[term][ 
       0 
       ] 
      
       self 
       .sort_terms.append([  
       int 
       (doc_id), term ]) 
      
       self 
       .sort_terms.sort() 
      
       def 
       __PickTerm( 
       self 
       , pivot_index): 
      
       return 
       0 
      
       def 
       __FindPivotTerm( 
       self 
       ): 
      
       score  
       = 
       0 
      
       for 
       i  
       in 
       range 
       ( 
       0 
       ,  
       len 
       ( 
       self 
       .sort_terms)): 
      
       score  
       + 
       = 
       1 
      
       if 
       score > 
       = 
       self 
       .threshold: 
      
       return 
       [  
       self 
       .sort_terms[i][ 
       1 
       ], i] 
      
       return 
       [  
       None 
       ,  
       len 
       ( 
       self 
       .sort_terms) ] 
      
       def 
       __IteratorInvertIndex( 
       self 
       , change_term, docid, pos): 
      
       '''move to doc id > docid''' 
      
       doc_list  
       = 
       self 
       .invert_index[change_term] 
      
       i  
       = 
       0 
      
       for 
       i  
       in 
       range 
       (pos,  
       len 
       (doc_list)): 
      
       if 
       doc_list[i] > 
       = 
       docid: 
      
       pos  
       = 
       i 
      
       docid  
       = 
       doc_list[i] 
      
       break 
      
       return 
       [ docid, pos ] 
      
       def 
       __AdvanceTerm( 
       self 
       , change_index, docid ): 
      
       change_term  
       = 
       self 
       .sort_terms[change_index][ 
       1 
       ] 
      
       pos  
       = 
       self 
       .current_invert_index[change_term][ 
       1 
       ] 
      
       (new_doc, new_pos)  
       = 
       \ 
      
       self 
       .__IteratorInvertIndex(change_term, docid, pos) 
      
       self 
       .current_invert_index[change_term]  
       = 
       \ 
      
       [ new_doc , new_pos ] 
      
       self 
       .sort_terms[change_index][ 
       0 
       ]  
       = 
       new_doc 
      
       def 
       __Next( 
       self 
       ): 
      
       if 
       self 
       .last_docid  
       = 
       = 
       self 
       .current_doc: 
      
       return 
       None 
      
       while 
       True 
       : 
      
       self 
       .debug_count  
       + 
       = 
       1 
      
       #sort terms by doc id 
      
       self 
       .__SortTerms() 
      
       #find pivot term > threshold 
      
       (pivot_term, pivot_index)  
       = 
       self 
       .__FindPivotTerm() 
      
       if 
       pivot_term  
       = 
       = 
       None 
       : 
      
       #no more candidate 
      
       return 
       None 
      
       #debug_info: 
      
       for 
       i  
       in 
       range 
       ( 
       0 
       , pivot_index  
       + 
       1 
       ): 
      
       print 
       self 
       .sort_terms[i][ 
       0 
       ], 
       self 
       .sort_terms[i][ 
       1 
       ], 
       "|" 
       , 
      
       print 
       "" 
      
       pivot_doc_id  
       = 
       self 
       .current_invert_index[pivot_term][ 
       0 
       ] 
      
       if 
       pivot_doc_id  
       = 
       = 
       self 
       .LastID:  
       #!! 
      
       return 
       None 
      
       if 
       pivot_doc_id < 
       = 
       self 
       .current_doc: 
      
       change_index  
       = 
       self 
       .__PickTerm(pivot_index) 
      
       self 
       .__AdvanceTerm( change_index,  
       self 
       .current_doc  
       + 
       1 
       ) 
      
       else 
       : 
      
       first_docid  
       = 
       self 
       .sort_terms[ 
       0 
       ][ 
       0 
       ] 
      
       if 
       pivot_doc_id  
       = 
       = 
       first_docid: 
      
       self 
       .current_doc  
       = 
       pivot_doc_id 
      
       return 
       self 
       .current_doc 
      
       else 
       : 
      
       #pick all preceding term 
      
       for 
       i  
       in 
       range 
       ( 
       0 
       , pivot_index): 
      
       change_index  
       = 
       i 
      
       self 
       .__AdvanceTerm( change_index, pivot_doc_id ) 
      
       def 
       DoQuery( 
       self 
       , query_terms): 
      
       self 
       .__InitQuery(query_terms) 
      
       while 
       True 
       : 
      
       candidate_docid  
       = 
       self 
       .__Next() 
      
       if 
       candidate_docid  
       = 
       = 
       None 
       : 
      
       break 
      
       print 
       "candidate_docid:" 
       ,candidate_docid 
      
       #insert candidate_docid to heap 
      
       #update threshold 
      
       print 
       "debug_count:" 
       , 
       self 
       .debug_count 
      
       if 
       __name__  
       = 
       = 
       "__main__" 
       : 
      
       testIndex  
       = 
       {} 
      
       testIndex[ 
       "t1" 
       ]  
       = 
       [  
       0 
       ,  
       1 
       ,  
       2 
       ,  
       3 
       ,  
       6 
       ,  
       2000000000 
       ] 
      
       testIndex[ 
       "t2" 
       ]  
       = 
       [  
       3 
       ,  
       4 
       ,  
       5 
       ,  
       6 
       ,  
       2000000000 
       ] 
      
       testIndex[ 
       "t3" 
       ]  
       = 
       [  
       2 
       ,  
       5 
       ,  
       2000000000 
       ] 
      
       testIndex[ 
       "t4" 
       ]  
       = 
       [  
       4 
       ,  
       6 
       ,  
       2000000000 
       ] 
      
       w  
       = 
       WAND(testIndex,  
       6 
       ) 
      
       w.DoQuery([ 
       "t1" 
       ,  
       "t2" 
       ,  
       "t3" 
       ,  
       "t4" 
       ])