50万邮件文本分域检索与查询的python实现(6)

最后一节介绍分区域布尔查询的实现

分区域查询现在可以支持单个词查询(xxx.region),OR(xxx1.region1 OR xxx2.region2),NOT(xxx1.region1NOT xxx2.region2, NOT xxx1.region1),AND(xxx1.region1 AND xxx2.region2,xxx1.region1 AND xxx2.region2AND xxx3.region3)这几种形式,region可以为‘to’、‘from’、‘subject’、‘body’、‘date’。

对于每个布尔查询,首先确定其查询类型,然后再调用相应的子程序进行处理。以OR为例,or_split函数返回的是分词的结果,即xxx1.region1和xxx2.region2,然后对于每个词,调用split_field进行分域,将xxx和region分开,然后到相应的region里检索词条xxx。由于是OR查询,因此对两个集合取并集后打印出结果。

懒得写了,直接给出代码,代码写的比较烂。

#================================================================
# =====================Multi-field Boolean Query================= 

import re
import pickle

#-------------------------
# ------functions------

#---func 1---
# import id-doc list from 'dbase_doc_id'
def opendb():
 opdb=open('dbase_id_doc','r')
 iddoc=pickle.load(opdb)
 return iddoc 

#---func 2---
# Check the splited tokens whether in the given mapping-dictionary or not.
def check_existence(word,mapping):
 if word not in mapping.keys():
  print '* Opps, doesn\'t have token: \'',word,'\''
  print '* Please Try again...'
  return False
 else:
  return True

#---func 3---
# Split Field and Token ,like 'jfk.subject'->token:'jfk', field:'subject',
# and put the relevant docs (with, for example:'jfk') into 'set'.

def split_field(token_input,to_m,f_m,d_m,s_m,b_m):

 to_mapping=to_m
 from_mapping=f_m
 date_mapping=d_m
 subject_mapping=s_m
 body_mapping=b_m
 qu_tt=''
 content=''
 
 #--the explaination of the regular expression is shown in main()
 qu_t=re.findall('\.[\w]+$',token_input)
 cnt=re.findall('.*[\.]',token_input)
 for i in cnt:
  cnt_tmp=i[0:-1]
  content=cnt_tmp
   
 if qu_t=='':
  print '* Not given field! Try again. (Correct field are: .to, .from, .date, .subject, .body)'
  #---patient! not 'pass', the difference between 'return' and 'pass'
  #-------------------------------------------------------------------------------------
  #---'handle'is an import signal, telling main() to deal with the error, 
  #---rather than the empty 'set' type.
  #---(for the Return Value of split_field() is given to a variable,
  #---if the variable further given to a funtion ,then will cause Mismatch Type error!)
  #-------------------------------------------------------------------------------------
  return 'handle'
 else:
  for i in qu_t:
   qu_tt=i[1:]

 dbases={'to':to_mapping,'from':from_mapping,'date':date_mapping,'subject':subject_mapping,'body':body_mapping,}
 if qu_tt not in dbases.keys():
  print '* Field input error! Try again. (Correct field are: .to, .from, .date, .subject, .body)'
  return 'handle'
 else:
  # dbases[qu_tt] is mapping-dictionary name
  print '* -- Query Field is :\'',qu_tt,'\' ,Token is:\'',content,'\' --'
  if check_existence(content,dbases[qu_tt])==True:
   set_doc=set()
   for item in dbases[qu_tt][content]:
    set_doc.add(item[0])
   #--Pay attention to the location of 'return set_doc',or will cause only one output--
   return set_doc
  else:
   return 'handle'

   
#---func 4---
# special function for AND, get the query field
def get_AND_field(token_input,to_m,f_m,d_m,s_m,b_m):

 to_mapping=to_m
 from_mapping=f_m
 date_mapping=d_m
 subject_mapping=s_m
 body_mapping=b_m
 qu_tt=''
 qu_t=re.findall('\.[\w]+$',token_input)
 for i in qu_t:
  qu_tt=i[1:]
 dbases={'to':to_mapping,'from':from_mapping,'date':date_mapping,'subject':subject_mapping,'body':body_mapping,}
 return dbases[qu_tt]

#---func 5---
# special function for AND, get the query content
def get_AND_content(token_input,to_m,f_m,d_m,s_m,b_m):

 to_mapping=to_m
 from_mapping=f_m
 date_mapping=d_m
 subject_mapping=s_m
 body_mapping=b_m
 content=''
 cnt=re.findall('.*[\.]',token_input)
 for i in cnt:
  cnt_tmp=i[0:-1]
  content=cnt_tmp
 return content


#---func 6,7,8---
# If you input an query string containing bool word: AND, NOT, and OR, 
# then function "*_split()" splits it into two words.
# For example,'jfk1.subject AND jfk2.subject' -> ['jfk1.subject','jfk2.subject']
# Attention: 
#  query like 'NOT jfk' is not included here, it will be processed in a different way (Check main() for details).

def and_split(query_token):
 query=re.split(' AND ',query_token)
 return query

def or_split(query_token):
 query=re.split(' OR ',query_token)
 return query

def not_split(query_token):
 query=re.split(' NOT ',query_token)
 return query

#---func 9---
# Print the query result.
def print_result(set,bool_identifier):
 print '****Hit docs for query: \'',bool_identifier,'\'****'

 id_doc=opendb()
 operation=['Field1 AND Field2 AND Field3','Field1 AND Field2']
 if bool_identifier in operation:
  print 'calculate the Cosine Similarity between each hit doc and query vector (just using \'tf\')'
  print 'Outputing the sorted result:'
  result_num=raw_input('< How many results do you want to print?(input \'all\' print all)>  ')
  if result_num=='all':
   m=1
   for i in set:
    print m,': ',id_doc[i[0]],'	  ( cosine similarity:',i[1],')'
    m=m+1
   print '****************************************************************'
  else:
   m=1
   for i in set:
    print m,': ',id_doc[i[0]],'	  ( cosine similarity:',i[1],')'
    m=m+1
    if m <= int(result_num):
     continue
    else:
     break
   print '****************************************************************'
 
 else:
  m=1
  for i in set:
   print m,': ',id_doc[i]
   m=m+1
  print '****************************************************************'

# --definations end--
#------------------------


def main():
 
 #-----init-----
 print ' ( ---- Query Support Multi-Field ----- )'
 print '* Import pickle files :\'dbase_to\'|\'dbase_from\'|\'dbase_date\'|\'dbase_subject\'|\'dbase_body\' from hard disk'
 print '* Load inverted lists :\'To\'|\'From\'|\'Date\'|\'Subject\'|\'mail body\' into memory. '
 print '* Please waiting...'
  
 db_to=open('dbase_to','r')
 to_mapping=pickle.load(db_to)
 print '* \'dbase_to\' Done!'
  
 db_from=open('dbase_from','r')
 from_mapping=pickle.load(db_from)
 print '* \'dbase_from\' Done!'
  
 db_date=open('dbase_date','r')
 date_mapping=pickle.load(db_date)
 print '* \'dbase_date\' Done!'
  
 db_subject=open('dbase_subject','r')
 subject_mapping=pickle.load(db_subject)
 print '* \'dbase_subject\' Done!'
  
 db_body=open('dbase_body','r')
 body_mapping=pickle.load(db_body)
 print '* \'dbase_body\' Done!'
  
 print '* Now, you can query!'
 print '* ( query like \'cash.body AND NOV.date\', fields MUST BE in Low Case, and MUST have one )'
 # ---init end----

 while True:
  query_input=raw_input('Please input query string(support Multi-Field boolean query \'AND\'|\'OR\'|\'NOT\', \'q\' to quit): ')
  
  if query_input=='q':
   exit()
  
  # ---elif No. 1----
  elif 'NOT ' in re.findall('^NOT ',query_input):
   # qu_tt is represent for field
   # content is query token
   qu_tt=''
   content=''
   
   #--1.get rid of 'NOT '
   query_input_tmp=query_input[4:]
   #--2.match '~.ddd' from end of the string
   qu_t=re.findall('\.[\w]+$',query_input_tmp)
   #--3.match the string till meet the last '.'
   cnt=re.findall('.*[\.]',query_input_tmp)
   #--4.because cnt is a list, even just has one element
   for i in cnt:
    #--5.get rid of the last '.' kept in the string generated in 3.
    cnt_tmp=i[0:-1]
    content=cnt_tmp
   
   if qu_t=='':
    print '* Not given field! Try again. (Correct field are: .to, .from, .date, .subject, .body)'
    #---patient! not 'return', the difference between 'return' and 'pass'
    pass
   else:
    for i in qu_t:
     qu_tt=i[1:]
   
   dbases={'to':to_mapping,'from':from_mapping,'date':date_mapping,'subject':subject_mapping,'body':body_mapping,}
   # ------if outside-----
   if qu_tt not in dbases.keys():
    print '* Field input error! Try again. (Correct field are: .to, .from, .date, .subject, .body)'
    pass
   else:
    # dbases[qu_tt]
    print '* -- Query Field is \'',qu_tt,'\', Token is:\'',content,'\' -- '
    # ----if inside----
    if check_existence(content,dbases[qu_tt])==True:
     print '****Hit docs for query: \'NOT',content,'\' in field \'',qu_tt,'\'****'
   
     complement_set=set()
     set_not=set()
     for item in dbases[qu_tt][content]:
      set_not.add(item[0])
    
     union_set=set()
     for key in dbases[qu_tt].keys():
      set_tmp=set()
      for doc in dbases[qu_tt][key]:
       set_tmp.add(doc[0])
      union_set=union_set | set_tmp
     complement_set=union_set - set_not
   
     id_doc=opendb()

     i=1
     for doc in complement_set:
      print i,': ',id_doc[doc]
      i=i+1
     print '******************************************'
   
    else:
     pass
    # ---if inside end---
   # ---if outside end---

  # ---elif No. 2---
  elif ' AND ' in re.findall(' AND ',query_input):
   sqt=and_split(query_input)
   #--dealing with 3 words AND query 
   if len(sqt)==3:
    set1=split_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
    set2=split_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
    set3=split_field(sqt[2],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
    if (set1=='handle')|(set2=='handle')|(set3=='handle'):
     pass
    else:
     intersection=set1 & set2 & set3
     # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     # consine similarity of every hit doc and the query vector
     map_cnt={}
     for i in intersection:
      tknum1=0
      tknum2=0
      tknum3=0
      map1=get_AND_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      cont1=get_AND_content(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      for token1 in map1[cont1]:
       if i==token1[0]:
        tknum1=token1[1]
        continue
      map2=get_AND_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      cont2=get_AND_content(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      for token2 in map2[cont2]:
       if i==token2[0]:
        tknum2=token2[1]
        continue
      map3=get_AND_field(sqt[2],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      cont3=get_AND_content(sqt[2],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      for token3 in map3[cont3]:
       if i==token3[0]:
        tknum3=token3[1]
        continue
      # cosine similarity
      tk=tknum1*1+tknum2*1+tknum3*1
      map_cnt.setdefault(i,[]).append(tk)
     
     # sort 
     sorted_list=[]
     sorted_list=sorted(map_cnt.iteritems(), key=lambda a:a[1], reverse=True)
     # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     print_result(sorted_list,'Field1 AND Field2 AND Field3')

   else:    
    set1=split_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
    set2=split_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
    if (set1=='handle')|(set2=='handle'):
     pass
    else:
     intersection=set1 & set2
     # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     # consine similarity of every hit doc and the query vector
     map_cnt={}
     for i in intersection:
      tknum1=0
      tknum2=0
      map1=get_AND_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      cont1=get_AND_content(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      for token1 in map1[cont1]:
       if i==token1[0]:
        tknum1=token1[1]
        continue
      map2=get_AND_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      cont2=get_AND_content(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      for token2 in map2[cont2]:
       if i==token2[0]:
        tknum2=token2[1]
        continue
      # cosine similarity
      tk=tknum1*1+tknum2*1
      map_cnt.setdefault(i,[]).append(tk)
     
     # sort 
     sorted_list=[]
     sorted_list=sorted(map_cnt.iteritems(), key=lambda a:a[1], reverse=True)
     # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     print_result(sorted_list,'Field1 AND Field2')

  # ---elif No. 3---
  elif ' OR ' in re.findall(' OR ',query_input):
   sqt=or_split(query_input)
   set1=split_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
   set2=split_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
   if (set1=='handle')|(set2=='handle'):
    pass
   else:
    union=set1 | set2
    print_result(union,'Field11 OR Field2')

  # ---elif No. 4---
  elif ' NOT ' in re.findall(' NOT ',query_input):
   sqt=not_split(query_input)
   set1=split_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
   set2=split_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
   if (set1=='handle')|(set2=='handle'):
    pass
   else:
    complement=set1 - set2
    print_result(complement,'Field1 NOT Field2')

  else:
   set1=split_field(query_input,to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
   if set1=='handle':
    pass
   else:
    print_result(set1,'Single Word Query')

if __name__ == '__main__':
 main()

# --for any bug or question, 
# --please mail to fkjiang@mail.ustc.edu.cn
#=================================================================


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值