最后一节介绍分区域布尔查询的实现
分区域查询现在可以支持单个词查询(xxx.region),OR(xxx1.region1 OR xxx2.region2),NOT(xxx1.region1 NOT xxx2.region2, NOT xxx1.region1),AND(xxx1.region1 AND xxx2.region2,xxx1.region1 AND xxx2.region2 AND xxx3.region3)这几种形式,region可以为‘to’、‘from’、‘subject’、‘body’、‘date’。
对于每个布尔查询,首先确定其查询类型,然后再调用相应的子程序进行处理。以OR为例,or_split函数返回的是分词的结果,即xxx1.region1和xxx2.region2,然后对于每个词,调用split_field进行分域,将xxx和region分开,然后到相应的region里检索词条xxx。由于是OR查询,因此对两个集合取并集后打印出结果。
懒得写了,直接给出代码,代码写的比较烂。
#================================================================
# =====================Multi-field Boolean Query=================
import re
import pickle
#-------------------------
# ------functions------
#---func 1---
# import id-doc list from 'dbase_doc_id'
def opendb():
    """Load and return the doc-id -> document-name mapping pickled in 'dbase_id_doc'.

    Fix: the original never closed the file handle; a ``with`` block
    guarantees the file is closed even if ``pickle.load`` raises.
    """
    with open('dbase_id_doc', 'r') as opdb:
        return pickle.load(opdb)
#---func 2---
# Check the splited tokens whether in the given mapping-dictionary or not.
def check_existence(word, mapping):
    """Return True if *word* is a key of *mapping* (an inverted index).

    Prints a hint for the user and returns False when the token is unknown.

    Fixes: ``word not in mapping.keys()`` materializes the key list on
    Python 2 (O(n) per lookup) -- direct dict membership is O(1); also
    corrects the 'Opps' typo in the user-facing message.  Single-argument
    ``print(...)`` behaves identically on Python 2 and 3.
    """
    if word in mapping:
        return True
    # Preserve the original spacing produced by comma-separated print args.
    print("* Oops, doesn't have token: ' %s '" % word)
    print('* Please Try again...')
    return False
#---func 3---
# Split Field and Token ,like 'jfk.subject'->token:'jfk', field:'subject',
# and put the relevant docs (with, for example:'jfk') into 'set'.
def split_field(token_input, to_m, f_m, d_m, s_m, b_m):
    """Split a query term like 'jfk.subject' into token 'jfk' and field
    'subject', then look the token up in that field's inverted index.

    Parameters:
        token_input -- term of the form '<token>.<field>'
        to_m, f_m, d_m, s_m, b_m -- inverted indexes for the to / from /
            date / subject / body fields (token -> list of (doc_id, tf))

    Returns:
        A set of matching doc ids on success, or the sentinel string
        'handle' on any error, so the caller can skip the set operations
        instead of hitting a type mismatch.
    """
    # '.field' suffix at the end of the term; at most one match.
    field_match = re.findall(r'\.[\w]+$', token_input)
    # Greedy match up to the LAST '.' -- everything before it is the token.
    content_match = re.findall(r'.*[\.]', token_input)
    content = content_match[-1][:-1] if content_match else ''
    # BUGFIX: the original tested ``qu_t == ''``, but re.findall returns a
    # list, so that branch was unreachable and missing-field input produced
    # the wrong ("Field input error") diagnostic.
    if not field_match:
        print('* Not given field! Try again. (Correct field are: .to, .from, .date, .subject, .body)')
        # Must *return* (not pass): callers combine the result with set
        # operators, so an error needs an explicit sentinel value.
        return 'handle'
    field = field_match[-1][1:]
    dbases = {'to': to_m, 'from': f_m, 'date': d_m,
              'subject': s_m, 'body': b_m}
    if field not in dbases:
        print('* Field input error! Try again. (Correct field are: .to, .from, .date, .subject, .body)')
        return 'handle'
    print("* -- Query Field is :' %s ' ,Token is:' %s ' --" % (field, content))
    if not check_existence(content, dbases[field]):
        return 'handle'
    # Posting-list entries are (doc_id, tf) pairs; collect the doc ids.
    return set(item[0] for item in dbases[field][content])
#---func 4---
# special function for AND, get the query field
def get_AND_field(token_input, to_m, f_m, d_m, s_m, b_m):
    """Helper for AND queries: return the inverted index selected by the
    '.field' suffix of *token_input* (e.g. 'jfk.subject' -> subject index).

    Raises KeyError if the suffix is missing or not a known field name;
    callers validate the term via split_field() first.
    """
    index_by_name = {'to': to_m, 'from': f_m, 'date': d_m,
                     'subject': s_m, 'body': b_m}
    suffix = ''
    for match in re.findall(r'\.[\w]+$', token_input):
        suffix = match[1:]
    return index_by_name[suffix]
#---func 5---
# special function for AND, get the query content
def get_AND_content(token_input, to_m, f_m, d_m, s_m, b_m):
    """Helper for AND queries: return the token part of *token_input*,
    i.e. everything before the last '.' ('jfk.subject' -> 'jfk').

    The five index arguments are unused; they are kept so the signature
    mirrors get_AND_field().  Returns '' when the term has no '.'.
    """
    token = ''
    for prefix in re.findall(r'.*[\.]', token_input):
        token = prefix[:-1]
    return token
#---func 6,7,8---
# If you input an query string containing bool word: AND, NOT, and OR,
# then function "*_split()" splits it into two words.
# For example,'jfk1.subject AND jfk2.subject' -> ['jfk1.subject','jfk2.subject']
# Attention:
# query like 'NOT jfk' is not included here, it will be processed in a different way (Check main() for details).
def and_split(query_token):
    """Split an AND query on the literal separator ' AND '.

    'jfk1.subject AND jfk2.subject' -> ['jfk1.subject', 'jfk2.subject'].
    The separator has no regex metacharacters, so plain str.split is
    equivalent to the original re.split.
    """
    return query_token.split(' AND ')
def or_split(query_token):
    """Split an OR query on the literal separator ' OR '.

    'a.body OR b.date' -> ['a.body', 'b.date'].  str.split matches the
    original re.split behavior for a plain-text separator.
    """
    return query_token.split(' OR ')
def not_split(query_token):
    """Split a binary NOT query on the literal separator ' NOT '.

    'a.body NOT b.date' -> ['a.body', 'b.date'].  A leading-NOT query
    ('NOT x.f') is handled separately in main(), not here.
    """
    return query_token.split(' NOT ')
#---func 9---
# Print the query result.
# NOTE(review): the first parameter shadows the builtin `set`; renaming it
# would change the (keyword-callable) interface, so it is documented instead.
def print_result(set,bool_identifier):
    """Print the hit documents of one query.

    Parameters:
        set -- for AND queries: a list of (doc_id, [score]) pairs already
            sorted by score; otherwise a plain collection of doc ids.
        bool_identifier -- label describing the query form; AND labels
            select the ranked-output branch below.
    """
    print '****Hit docs for query: \'',bool_identifier,'\'****'
    # Map doc ids back to document names for display.
    id_doc=opendb()
    # Only AND results carry similarity scores and get ranked output.
    operation=['Field1 AND Field2 AND Field3','Field1 AND Field2']
    if bool_identifier in operation:
        print 'calculate the Cosine Similarity between each hit doc and query vector (just using \'tf\')'
        print 'Outputing the sorted result:'
        result_num=raw_input('< How many results do you want to print?(input \'all\' print all)> ')
        if result_num=='all':
            m=1
            for i in set:
                # i is (doc_id, [score]); i[0] indexes the name mapping.
                print m,': ',id_doc[i[0]],' ( cosine similarity:',i[1],')'
                m=m+1
            print '****************************************************************'
        else:
            m=1
            for i in set:
                print m,': ',id_doc[i[0]],' ( cosine similarity:',i[1],')'
                m=m+1
                # Stop after printing int(result_num) entries; m was already
                # incremented, so the check uses <= rather than <.
                if m <= int(result_num):
                    continue
                else:
                    break
            print '****************************************************************'
    else:
        # Unranked branch: items are bare doc ids.
        m=1
        for i in set:
            print m,': ',id_doc[i]
            m=m+1
        print '****************************************************************'
# --definations end--
#------------------------
def main():
    """Interactive console for multi-field boolean queries.

    Loads the five pickled inverted indexes (to / from / date / subject /
    body) into memory, then loops reading a query string and dispatching
    on its form: leading 'NOT x.f', 'a.f1 AND b.f2 [AND c.f3]',
    'a.f1 OR b.f2', 'a.f1 NOT b.f2', or a single 'x.f' term.
    Enter 'q' to quit.
    """
    #-----init: load the five pickled inverted indexes-----
    print ' ( ---- Query Support Multi-Field ----- )'
    print '* Import pickle files :\'dbase_to\'|\'dbase_from\'|\'dbase_date\'|\'dbase_subject\'|\'dbase_body\' from hard disk'
    print '* Load inverted lists :\'To\'|\'From\'|\'Date\'|\'Subject\'|\'mail body\' into memory. '
    print '* Please waiting...'
    db_to=open('dbase_to','r')
    to_mapping=pickle.load(db_to)
    print '* \'dbase_to\' Done!'
    db_from=open('dbase_from','r')
    from_mapping=pickle.load(db_from)
    print '* \'dbase_from\' Done!'
    db_date=open('dbase_date','r')
    date_mapping=pickle.load(db_date)
    print '* \'dbase_date\' Done!'
    db_subject=open('dbase_subject','r')
    subject_mapping=pickle.load(db_subject)
    print '* \'dbase_subject\' Done!'
    db_body=open('dbase_body','r')
    body_mapping=pickle.load(db_body)
    print '* \'dbase_body\' Done!'
    print '* Now, you can query!'
    print '* ( query like \'cash.body AND NOV.date\', fields MUST BE in Low Case, and MUST have one )'
    # ---init end----
    while True:
        query_input=raw_input('Please input query string(support Multi-Field boolean query \'AND\'|\'OR\'|\'NOT\', \'q\' to quit): ')
        if query_input=='q':
            exit()
        # ---elif No. 1: leading 'NOT token.field' -> complement within one field----
        elif 'NOT ' in re.findall('^NOT ',query_input):
            # qu_tt holds the field name, content the query token
            qu_tt=''
            content=''
            #--1.get rid of 'NOT '
            query_input_tmp=query_input[4:]
            #--2.match the '.field' suffix at the end of the string
            qu_t=re.findall('\.[\w]+$',query_input_tmp)
            #--3.greedy match up to the last '.'
            cnt=re.findall('.*[\.]',query_input_tmp)
            #--4.cnt is a list, even when it has just one element
            for i in cnt:
                #--5.drop the trailing '.' kept by the match in step 3
                cnt_tmp=i[0:-1]
                content=cnt_tmp
            # NOTE(review): re.findall returns a list, so qu_t=='' can never
            # be true; missing-field input actually falls through to the
            # "Field input error" branch below.
            if qu_t=='':
                print '* Not given field! Try again. (Correct field are: .to, .from, .date, .subject, .body)'
                # 'pass' (not 'return'): stay inside the query loop.
                pass
            else:
                for i in qu_t:
                    qu_tt=i[1:]
                dbases={'to':to_mapping,'from':from_mapping,'date':date_mapping,'subject':subject_mapping,'body':body_mapping,}
                # ------if outside-----
                if qu_tt not in dbases.keys():
                    print '* Field input error! Try again. (Correct field are: .to, .from, .date, .subject, .body)'
                    pass
                else:
                    # dbases[qu_tt] is the inverted index of the chosen field
                    print '* -- Query Field is \'',qu_tt,'\', Token is:\'',content,'\' -- '
                    # ----if inside----
                    if check_existence(content,dbases[qu_tt])==True:
                        print '****Hit docs for query: \'NOT',content,'\' in field \'',qu_tt,'\'****'
                        complement_set=set()
                        set_not=set()
                        for item in dbases[qu_tt][content]:
                            set_not.add(item[0])
                        # Union of every posting list in this field gives the
                        # universe of docs; complement = universe - hits.
                        union_set=set()
                        for key in dbases[qu_tt].keys():
                            set_tmp=set()
                            for doc in dbases[qu_tt][key]:
                                set_tmp.add(doc[0])
                            union_set=union_set | set_tmp
                        complement_set=union_set - set_not
                        id_doc=opendb()
                        i=1
                        for doc in complement_set:
                            print i,': ',id_doc[doc]
                            i=i+1
                        print '******************************************'
                    else:
                        pass
                    # ---if inside end---
                # ---if outside end---
        # ---elif No. 2: 'a.f1 AND b.f2' or 'a.f1 AND b.f2 AND c.f3'---
        elif ' AND ' in re.findall(' AND ',query_input):
            sqt=and_split(query_input)
            #--dealing with the 3-term AND query
            if len(sqt)==3:
                set1=split_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                set2=split_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                set3=split_field(sqt[2],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                if (set1=='handle')|(set2=='handle')|(set3=='handle'):
                    pass
                else:
                    intersection=set1 & set2 & set3
                    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                    # cosine similarity of every hit doc vs. the query vector
                    # (tf only; see tk below)
                    map_cnt={}
                    for i in intersection:
                        tknum1=0
                        tknum2=0
                        tknum3=0
                        map1=get_AND_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                        cont1=get_AND_content(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                        for token1 in map1[cont1]:
                            if i==token1[0]:
                                tknum1=token1[1]
                                continue
                        map2=get_AND_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                        cont2=get_AND_content(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                        for token2 in map2[cont2]:
                            if i==token2[0]:
                                tknum2=token2[1]
                                continue
                        map3=get_AND_field(sqt[2],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                        cont3=get_AND_content(sqt[2],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                        for token3 in map3[cont3]:
                            if i==token3[0]:
                                tknum3=token3[1]
                                continue
                        # similarity score: sum of tf counts (query weights are all 1)
                        tk=tknum1*1+tknum2*1+tknum3*1
                        map_cnt.setdefault(i,[]).append(tk)
                    # sort hits by score, descending
                    sorted_list=[]
                    sorted_list=sorted(map_cnt.iteritems(), key=lambda a:a[1], reverse=True)
                    # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                    print_result(sorted_list,'Field1 AND Field2 AND Field3')
            else:
                set1=split_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                set2=split_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                if (set1=='handle')|(set2=='handle'):
                    pass
                else:
                    intersection=set1 & set2
                    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                    # cosine similarity of every hit doc vs. the query vector
                    map_cnt={}
                    for i in intersection:
                        tknum1=0
                        tknum2=0
                        map1=get_AND_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                        cont1=get_AND_content(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                        for token1 in map1[cont1]:
                            if i==token1[0]:
                                tknum1=token1[1]
                                continue
                        map2=get_AND_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                        cont2=get_AND_content(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
                        for token2 in map2[cont2]:
                            if i==token2[0]:
                                tknum2=token2[1]
                                continue
                        # similarity score: sum of tf counts
                        tk=tknum1*1+tknum2*1
                        map_cnt.setdefault(i,[]).append(tk)
                    # sort hits by score, descending
                    sorted_list=[]
                    sorted_list=sorted(map_cnt.iteritems(), key=lambda a:a[1], reverse=True)
                    # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                    print_result(sorted_list,'Field1 AND Field2')
        # ---elif No. 3: 'a.f1 OR b.f2' -> union of the two hit sets---
        elif ' OR ' in re.findall(' OR ',query_input):
            sqt=or_split(query_input)
            set1=split_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
            set2=split_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
            if (set1=='handle')|(set2=='handle'):
                pass
            else:
                union=set1 | set2
                print_result(union,'Field11 OR Field2')
        # ---elif No. 4: 'a.f1 NOT b.f2' -> set difference---
        elif ' NOT ' in re.findall(' NOT ',query_input):
            sqt=not_split(query_input)
            set1=split_field(sqt[0],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
            set2=split_field(sqt[1],to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
            if (set1=='handle')|(set2=='handle'):
                pass
            else:
                complement=set1 - set2
                print_result(complement,'Field1 NOT Field2')
        # ---single 'token.field' query---
        else:
            set1=split_field(query_input,to_mapping,from_mapping,date_mapping,subject_mapping,body_mapping)
            if set1=='handle':
                pass
            else:
                print_result(set1,'Single Word Query')
# Run the interactive query console only when executed as a script.
if __name__ == '__main__':
    main()
# --for any bug or question,
# --please mail to fkjiang@mail.ustc.edu.cn
#=================================================================