searchPhrase函数实现了短语查询功能:
def word_split(text):
    """
    Split a text into normalized words.

    A word is a maximal run of characters for which ``isalnum()`` is true;
    every other character acts as a separator. Words are lower-cased.

    Returns a list of ``(position, word)`` tuples, where ``position`` is the
    1-based ordinal of the word within the text (first word is 1, second is
    2, ...), not a character offset.
    """
    word_list = []
    current = []

    def flush():
        # Emit the word collected so far (if any) and reset the buffer.
        if current:
            word = u''.join(current).lower()
            # The next ordinal is always one more than the words emitted.
            word_list.append((len(word_list) + 1, word))
            del current[:]

    for char in text:
        if char.isalnum():
            current.append(char)
        else:
            flush()
    # The text may end in the middle of a word.
    flush()
    return word_list
def inverted_index(text):
    """
    Build the inverted index of a single text document.

    The text is tokenized with word_split; the result maps every word to
    the list of locations at which it occurs, in order of appearance:
    {word: [locations]}
    """
    locations_by_word = {}
    for location, token in word_split(text):
        # First sighting of a word: start its location list.
        if token not in locations_by_word:
            locations_by_word[token] = []
        locations_by_word[token].append(location)
    return locations_by_word
def inverted_index_add(inverted, doc_id, doc_index):
    """
    Merge the single-document index doc_index into the multi-document
    index inverted, filing its entries under doc_id.

    inverted has the shape {word: {doc_id: [locations]}} and is modified in
    place; an existing entry for the same word and doc_id is overwritten.
    The location lists are stored by reference, not copied.

    Returns the same inverted dict that was passed in.
    """
    # items() (rather than iteritems()) exists on both Python 2 and Python 3.
    for word, locations in doc_index.items():
        inverted.setdefault(word, {})[doc_id] = locations
    return inverted
def search(inverted, query):
    """
    Return the ids of the documents that contain all the words of query.

    inverted is a multi-document index of the shape
    {word: {doc_id: [locations]}}; query is tokenized with word_split, so
    it is matched case-insensitively.

    Returns a set of doc ids. When the query contains no words, or one of
    its words does not occur in the index at all, an empty list is returned
    (kept for backward compatibility; both empty results are falsy).
    """
    words = [word for _, word in word_split(query)]
    # A word that is absent from the index occurs in no document, so the
    # conjunction cannot match; dropping it would report false positives.
    if not words or any(word not in inverted for word in words):
        return []
    doc_sets = [set(inverted[word]) for word in words]
    # Documents common to every word of the query.
    return set.intersection(*doc_sets)
def searchPhrase(inverted, query):
    """
    Return the ids of the documents that contain query as a phrase, i.e.
    all of its words at consecutive word positions and in the same order.

    inverted is a multi-document index of the shape
    {word: {doc_id: [locations]}}; query is tokenized with word_split, so
    it is matched case-insensitively. A single-word query matches every
    document that contains the word.

    Returns a list of doc ids without duplicates, in the iteration order of
    the first word's postings. The list is empty when the query has no
    words or when one of its words does not occur in the index.
    """
    words = [word for _, word in word_split(query)]
    # An unknown word makes the phrase unmatchable; silently dropping it
    # would shift the offsets of the remaining words.
    if not words or any(word not in inverted for word in words):
        return []
    return [doc_id for doc_id in inverted[words[0]]
            if _doc_contains_phrase(inverted, words, doc_id)]


def _doc_contains_phrase(inverted, words, doc_id):
    """Tell whether the words occur consecutively, in order, in doc_id."""
    following = []
    for word in words[1:]:
        postings = inverted[word]
        if doc_id not in postings:
            return False
        # Sets give constant-time position lookups below.
        following.append(set(postings[doc_id]))
    # Try every occurrence of the first word as the start of the phrase:
    # the k-th following word must sit exactly k positions after it.
    for start in inverted[words[0]][doc_id]:
        if all(start + offset in positions
               for offset, positions in enumerate(following, 1)):
            return True
    return False
# Sample documents used by the demo below.
doc1 = """
Niners head coach Mike Singletary will let Alex Smith remain his starting
quarterback, but his vote of confidence is anything but a long-term mandate.
Smith now will work on a week-to-week basis, because Singletary has voided
his year-long lease on the job.
"I think from this point on, you have to do what's best for the football team,"
Singletary said Monday, one day after threatening to bench Smith during a
27-24 loss to the visiting Eagles.
"""
doc2 = """
The fifth edition of West Coast Green, a conference focusing on "green" home
innovations and products, rolled into San Francisco's Fort Mason last week
intent, per usual, on making our living spaces more environmentally friendly
- one used-tire house at a time.Zero-rated buildings
To that end, there were presentations on topics such as water efficiency and
the burgeoning future of Net Zero-rated buildings that consume no energy and
produce no carbon emissions.on a job,on the job
"""

# Build the multi-document index {word: {doc_id: [locations]}}.
inverted = {}
documents = {'doc1': doc1, 'doc2': doc2}
# items() (rather than iteritems()) exists on both Python 2 and Python 3.
for doc_id, text in documents.items():
    doc_index = inverted_index(text)
    inverted_index_add(inverted, doc_id, doc_index)

# Single-argument print(...) calls produce the same output whether print is
# a statement (Python 2) or a function (Python 3).

# Search for common words.
print("*****search common words*****")
queries = ['Week', 'Niners', 'coast']
for query in queries:
    result_docs = search(inverted, query)
    print("Search for '%s': %r" % (query, result_docs))

# Search for phrases.
print("")
print("*****search phrases*****")
newQueries = ['Zero-rated buildings', 'on the job', 'West Coast']
for query in newQueries:
    result_docs = searchPhrase(inverted, query)
    print("Search for '%s': %r" % (query, result_docs))