50万邮件文本分域检索与查询的python实现（6）

最新推荐文章于 2022-05-03 10:19:05 发布

谷堆间的驴子

最新推荐文章于 2022-05-03 10:19:05 发布

阅读量607

点赞数

分类专栏： python 文章标签： python query token date input list

本文链接：https://blog.csdn.net/woshishuizzz/article/details/7985761

版权

python 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

最后一节介绍分区域布尔查询的实现

分区域查询现在可以支持单个词查询（xxx.region），OR（xxx1.region1 OR xxx2.region2），NOT（xxx1.region1NOT xxx2.region2， NOT xxx1.region1），AND（xxx1.region1 AND xxx2.region2，xxx1.region1 AND xxx2.region2AND xxx3.region3）这几种形式，region可以为‘to’、‘from’、‘subject’、‘body’、‘date’。

对于每个布尔查询，首先确定其查询类型，然后再调用相应的子程序进行处理。以OR为例，or_split函数返回的是分词的结果，即xxx1.region1和xxx2.region2，然后对于每个词，调用split_field进行分域，将xxx和region分开，然后到相应的region里检索词条xxx。由于是OR查询，因此对两个集合取并集后打印出结果。

懒得写了，直接给出代码，代码写的比较烂。

#================================================================
# =====================Multi-field Boolean Query================= 

import re
import pickle

#-------------------------
# ------functions------

#---func 1---
# import id-doc list from 'dbase_doc_id'
def opendb():
 opdb=open('dbase_id_doc','r')
 iddoc=pickle.load(opdb)
 return iddoc 

#---func 2---
# Check the splited tokens whether in the given mapping-dictionary or not.
def check_existence(word,mapping):
 if word not in mapping.keys():
  print '* Opps, doesn\'t have token: \'',word,'\''
  print '* Please Try again...'
  return False
 else:
  return True

#---func 3---
# Split Field and Token ,like 'jfk.subject'->token:'jfk', field:'subject',
# and put the relevant docs (with, for example:'jfk') into 'set'.

def split_field(token_input,to_m,f_m,d_m,s_m,b_m):

 to_mapping=to_m
 from_mapping=f_m
 date_mapping=d_m
 subject_mapping=s_m
 body_mapping=b_m
 qu_tt=''
 content=''
 
 #--the explaination of the regular expression is shown in main()
 qu_t=re.findall('\.[\w]+$',token_input)
 cnt=re.findall('.*[\.]',token_input)
 for i in cnt:
  cnt_tmp=i[0:-1]
  content=cnt_tmp
   
 if qu_t=='':
  print '* Not given field! Try again. (Correct field are: .to, .from, .date, .subject, .body)'
  #---patient! not 'pass', the difference between 'return' and 'pass'
  #-------------------------------------------------------------------------------------
  #---'handle'is an import signal, telling main() to deal with the error, 
  #---rather than the empty 'set' type.
  #---(for the Return Value of split_field() is given to a variable,
  #---if the variable further given to a funtion ,then will cause Mismatch Type error!)
  #-------------------------------------------------------------------------------------
  return 'handle'
 else:
  for i in qu_t:
   qu_tt=i[1:]

 dbases={'to':to_mapping,'from':from_mapping,'date':date_mapping,'subject':subject_mapping,'body':body_mapping,}
 if qu_tt not in dbases.keys():
  print '* Field input error! Try again. (Correct field are: .to, .from, .date, .subject, .body)'
  return 'handle'
 else:
  # dbases[qu_tt] is mapping-dictionary name
  print '* -- Query Field is :\'',qu_tt,'\' ,Token is:\'',content,'\' --'
  if check_existence(content,dbases[qu_tt])==True:
   set_doc=set()
   for item in dbases[qu_tt][content]:
    set_doc.add(item[0])
   #--Pay attention to the location of 'return set_doc',or will cause only one output--
   return set_doc
  else:
   return 'handle'

   
#---func 4---
# special function for AND, get the query field
def get_AND_field(token_input,to_m,f_m,d_m,s_m,b_m):

 to_mapping=to_m
 from_mapping=f_m
 date_mapping=d_m
 subject_mapping=s_m
 body_mapping=b_m
 qu_tt=''
 qu_t=re.findall('\.[\w]+$',token_input)
 for i in qu_t:
  qu_tt=i[1:]
 dbases={'to':to_mapping,'from':from_mapping,'date':date_mapping,'subject':subject_mapping,'body':body_mapping,}
 return dbases[qu_tt]

#---func 5---
# special function for AND, get the query content
def get_AND_content(token_input,to_m,f_m,d_m,s_m,b_m):

 to_mapping=to_m
 from_mapping=f_m
 date_mapping=d_m
 subject_mapping=s_m
 body_mapping=b_m
 content=''
 cnt=re.findall('.*[\.]',token_input)
 for i in cnt:
  cnt_tmp=i[0:-1]
  content=cnt_tmp
 return content


#---func 6,7,8---
# If you input an query string containing bool word: AND, NOT, and OR, 
# then function "*_split()" splits it into two words.
# For example,'jfk1.subject AND jfk2.subject' -> ['jfk1.subject','jfk2.subject']
# Attention: 
#  query like 'NOT jfk' is not included here, it will be processed in a different way (Check main() for details).

def and_split(query_token):
 query=re.split(' AND ',query_token)
 return query

def or_split(query_token):
 query=re.split(' OR ',query_token)
 return query

def not_split(query_token):
 query=re.split(' NOT ',query_token)
 return query

#---func 9---
# Print the query result.
def print_result(set,bool_identifier):
 print '****Hit docs for query: \'',bool_identifier,'\'****'

 id_doc=opendb()
 operation=['Field1 AND Field2 AND Field3','Field1 AND Field2']
 if bool_identifier in operation:
  print 'calculate the Cosine Similarity between each hit doc and query vector (just using \'tf\')'
  print 'Outputing the sorted result:'
  result_num=raw_input('< How many results do you want to print?(input \'all\' print all)>  ')
  if result_num=='all':
   m=1
   for i in set:
    print m,': ',id_doc[i[0]],'	  ( cosine similarity:',i[1],')'
    m=m+1
   print '****************************************************************'
  else:
   m=1
   for i in set:
    print m,': ',id_doc[i[0]],'	  ( cosine similarity:',i[1],')'
    m=m+1
    if m <= int(result_num):
     continue
    else:
     break
   print '****************************************************************'
 
 else:
  m=1
  for i in set:
   print m,': ',id_doc[i]
   m=m+1
  print '****************************************************************'

# --definations end--
#------------------------


def main():
 
 #-----init-----
 print ' ( ---- Query Support Multi-Field ----- )'
 print '* Import pickle files :\'dbase_to\'|\'dbase_from\'|\'dbase_date\'|\'dbase_subject\'|\'dbase_body\' from hard disk'
 print '* Load inverted lists :\'To\'|\'From\'|\'Date\'|\'Subject\'|\'mail body\' into memory. '
 print '* Please waiting...'
  
 db_to=open('dbase_to','r')
 to_mapping=pickle.load(db_to)
 print '* \'dbase_to\' Done!'
  
 db_from=open('dbase_from','r')
 from_mapping=pickle.load(db_from)
 print '* \'dbase_from\' Done!'
  
 db_date=open('dbase_date','r')
 date_mapping=pickle.load(db_date)
 print '* \'dbase_date\' Done!'
  
 db_subject=open('dbase_subject','r')
 subject_mapping=pickle.load(db_subject)
 print '* \'dbase_subject\' Done!'
  
 db_body=open('dbase_body','r')
 body_mapping=pickle.load(db_body)
 print '* \'dbase_body\' Done!'
  
 print '* Now, you can query!'
 print '* ( query like \'cash.body AND NOV.date\', fields MUST BE in Low Case, and MUST have one )'
 # ---init end----

 while True:
  query_input=raw_input('Please input query string(support Multi-Field boolean query \'AND\'|\'OR\'|\'NOT\', \'q\' to quit): ')
  
  if query_input=='q':
   exit()
  
  # ---elif No. 1----
  elif 'NOT ' in re.findall('^NOT ',query_input):
   # qu_tt is represent for field
   # content is query token
   qu_tt=''
   content=''
   
   #--1.get rid of 'NOT '
   query_input_tmp=query_input[4:]
   #--2.match '~.ddd' from end of the string
   qu_t=re.findall('\.[\w]+$',query_input_tmp)
   #--3.match the string till meet the last '.'
   cnt=re.findall('.*[\.]',query_input_tmp)
   #--4.because cnt is a list, even just has one element
   for i in cnt:
    #--5.get rid of the last '.' kept in the string generated in 3.
    cnt_tmp=i[0:-1]
    content=cnt_tmp
   
   if qu_t=='':
    print '* Not given field! Try again. (Correct field are: .to, .from, .date, .subject, .body)'
    #---patient! not 'return', the difference between 'return' and 'pass'
    pass
   else:
    for i in qu_t:
     qu_tt=i[1:]
   
   dbases={'to':to_mapping,'from':from_mapping,'date':date_mapping,'subject':subject_mapping,'body':body_mapping,}
   # ------if outside-----
   if qu_tt not in dbases.keys():
    print '* Field input error! Try again. (Correct field are: .to, .from, .date, .subject, .body)'
    pass
   else:
    # dbases[qu_tt]
    print '* -- Query Field is \'',qu_tt,'\', Token is:\'',content,'\' -- '
    # ----if inside----
    if check_existence(content,dbases[qu_tt])==True:
     print '****Hit docs for query: \'NOT',content,'\' in field \'',qu_tt,'\'****'
   
     complement_set=set()
     set_not=set()
     for item in dbases[qu_tt][content]:
      set_not.add(item[0])
    
     union_set=set()
     for key in dbases[qu_tt].keys():
      set_tmp=set()
      for doc in dbases[qu_tt][key]:
       set_tmp.add(doc[0])
      union_set=union_set | set_tmp
     complement_set=union_set - set_not
   
     id_doc=opendb()

     i=1
     for doc in complement_set:
      print i,': ',id_doc[doc]
      i=i+1
     print '******************************************'
   
    else:
     pass
    # ---if inside end---
   # ---if outside end---

  # ---elif No. 2---
  elif ' AND ' in re.findall(' AND ',query_input):
   sqt=and_split(query_input)
   #--dealing with 3 words AND query 
   if len(sqt)==3:
    set1=split_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
    set2=split_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
    set3=split_field(sqt[2],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
    if (set1=='handle')|(set2=='handle')|(set3=='handle'):
     pass
    else:
     intersection=set1 & set2 & set3
     # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     # consine similarity of every hit doc and the query vector
     map_cnt={}
     for i in intersection:
      tknum1=0
      tknum2=0
      tknum3=0
      map1=get_AND_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      cont1=get_AND_content(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      for token1 in map1[cont1]:
       if i==token1[0]:
        tknum1=token1[1]
        continue
      map2=get_AND_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      cont2=get_AND_content(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      for token2 in map2[cont2]:
       if i==token2[0]:
        tknum2=token2[1]
        continue
      map3=get_AND_field(sqt[2],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      cont3=get_AND_content(sqt[2],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      for token3 in map3[cont3]:
       if i==token3[0]:
        tknum3=token3[1]
        continue
      # cosine similarity
      tk=tknum1*1+tknum2*1+tknum3*1
      map_cnt.setdefault(i,[]).append(tk)
     
     # sort 
     sorted_list=[]
     sorted_list=sorted(map_cnt.iteritems(), key=lambda a:a[1], reverse=True)
     # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     print_result(sorted_list,'Field1 AND Field2 AND Field3')

   else:    
    set1=split_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
    set2=split_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
    if (set1=='handle')|(set2=='handle'):
     pass
    else:
     intersection=set1 & set2
     # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     # consine similarity of every hit doc and the query vector
     map_cnt={}
     for i in intersection:
      tknum1=0
      tknum2=0
      map1=get_AND_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      cont1=get_AND_content(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      for token1 in map1[cont1]:
       if i==token1[0]:
        tknum1=token1[1]
        continue
      map2=get_AND_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      cont2=get_AND_content(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
      for token2 in map2[cont2]:
       if i==token2[0]:
        tknum2=token2[1]
        continue
      # cosine similarity
      tk=tknum1*1+tknum2*1
      map_cnt.setdefault(i,[]).append(tk)
     
     # sort 
     sorted_list=[]
     sorted_list=sorted(map_cnt.iteritems(), key=lambda a:a[1], reverse=True)
     # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     print_result(sorted_list,'Field1 AND Field2')

  # ---elif No. 3---
  elif ' OR ' in re.findall(' OR ',query_input):
   sqt=or_split(query_input)
   set1=split_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
   set2=split_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
   if (set1=='handle')|(set2=='handle'):
    pass
   else:
    union=set1 | set2
    print_result(union,'Field11 OR Field2')

  # ---elif No. 4---
  elif ' NOT ' in re.findall(' NOT ',query_input):
   sqt=not_split(query_input)
   set1=split_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
   set2=split_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
   if (set1=='handle')|(set2=='handle'):
    pass
   else:
    complement=set1 - set2
    print_result(complement,'Field1 NOT Field2')

  else:
   set1=split_field(query_input,to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
   if set1=='handle':
    pass
   else:
    print_result(set1,'Single Word Query')

if __name__ == '__main__':
 main()

# --for any bug or question, 
# --please mail to fkjiang@mail.ustc.edu.cn
#=================================================================

谷堆间的驴子

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
50万邮件文本分域检索与查询的python实现（6）

最后一节介绍分区域布尔查询的实现分区域查询现在可以支持单个词查询（xxx.region），OR（xxx1.region1 OR xxx2.region2），NOT（xxx1.region1NOT xxx2.region2， NOT xxx1.region1），AND（xxx1.region1 AND xxx2.region2，xxx1.region1 AND xxx2.region2AND
复制链接

扫一扫

专栏目录