关闭

使用 Python 实现短语查询

标签: python倒排索引和短语查询
730人阅读 评论(1) 收藏 举报
分类:

下面代码中的 searchPhrase 函数基于倒排索引中记录的词位置,实现了短语查询功能:

def word_split(text):
    """
    Tokenize a text into normalized words.

    A word is a maximal run of alphanumeric characters; every other
    character acts as a separator. Each word is lower-cased.

    Returns a list of (position, word) tuples, where position is the
    1-based ordinal of the word inside the text (a word count, not a
    character offset).
    """
    # Blank out every separator so that a plain whitespace split yields
    # exactly the alphanumeric runs, in their original order.
    spaced = u''.join(ch if ch.isalnum() else u' ' for ch in text)
    return [(ordinal, token.lower())
            for ordinal, token in enumerate(spaced.split(), 1)]

def inverted_index(text):
    """
    Build the positional inverted index of a single text document.

    The text is tokenized with word_split; the result maps every
    normalized word to the list of word positions where it occurs,
    in order of appearance:
        {word: [positions]}
    """
    positions_by_word = {}

    for position, token in word_split(text):
        # First occurrence of a word opens its (initially empty) posting list.
        if token not in positions_by_word:
            positions_by_word[token] = []
        positions_by_word[token].append(position)

    return positions_by_word

def inverted_index_add(inverted, doc_id, doc_index):
    """
    Merge the inverted index of one document into a multi-document index.

    inverted  -- multi-document index, updated in place:
                     {word: {doc_id: [positions]}}
    doc_id    -- identifier under which the document is registered; an
                 existing entry for the same doc_id and word is overwritten.
    doc_index -- single-document index: {word: [positions]}

    Returns the same `inverted` object so calls can be chained.
    """
    # .items() (rather than the Python-2-only .iteritems()) keeps this
    # function usable on both Python 2 and Python 3.
    for word, locations in doc_index.items():
        indices = inverted.setdefault(word, {})
        # The position list is stored by reference, not copied.
        indices[doc_id] = locations
    return inverted

def search(inverted, query):
    """
    Return the set of document ids that contain every word of the query.

    inverted -- multi-document index: {word: {doc_id: [positions]}}
    query    -- free text; it is tokenized and normalized with word_split.

    The result is always a set. It is empty when the query contains no
    word at all, or when at least one query word does not occur in any
    indexed document (such a word cannot be "contained" by any document,
    so it must not be silently ignored).
    """
    matches = None
    for _, word in word_split(query):
        postings = inverted.get(word)
        if not postings:
            # A word missing from the index rules out every document.
            return set()
        doc_ids = set(postings)
        # Keep only the documents shared with all previous words.
        matches = doc_ids if matches is None else matches & doc_ids
    return matches if matches is not None else set()

def _phrase_in_doc(position_lists):
    """
    Tell whether consecutive positions p, p+1, p+2, ... exist such that
    p is in position_lists[0], p+1 in position_lists[1], and so on.
    """
    # Sets give O(1) membership tests for the follow-up words.
    followers = [set(positions) for positions in position_lists[1:]]
    for start in position_lists[0]:
        if all(start + offset in positions
               for offset, positions in enumerate(followers, 1)):
            return True
    return False


def searchPhrase(inverted, query):
    """
    Return the list of document ids that contain the query as a phrase,
    i.e. all query words at consecutive word positions, in query order.

    inverted -- multi-document index: {word: {doc_id: [positions]}}
    query    -- free text; it is tokenized and normalized with word_split.

    The result is a list without duplicates, ordered like the postings
    of the first query word. It is empty when the query has no word or
    when any query word is absent from the index (dropping such a word
    would shift the remaining words and produce false matches).
    A single-word query returns every document containing that word.
    """
    words = [word for _, word in word_split(query)]
    if not words:
        return []

    postings = []
    for word in words:
        if word not in inverted:
            return []
        postings.append(inverted[word])

    results = []
    for doc_id in postings[0]:
        # Only documents containing every query word can hold the phrase.
        if not all(doc_id in word_postings for word_postings in postings[1:]):
            continue
        # Every possible start position is tried, so a later occurrence
        # of the phrase is found even when earlier partial alignments fail.
        if _phrase_in_doc([word_postings[doc_id] for word_postings in postings]):
            results.append(doc_id)
    return results

# Sample document 1: a short sports news excerpt used as indexing input.
# It contains the phrase "on the job", one of the phrase queries run by
# the demo at the bottom of this file.
doc1 = """
Niners head coach Mike Singletary will let Alex Smith remain his starting 
quarterback, but his vote of confidence is anything but a long-term mandate.
Smith now will work on a week-to-week basis, because Singletary has voided 
his year-long lease on the job.
"I think from this point on, you have to do what's best for the football team,"
Singletary said Monday, one day after threatening to bench Smith during a 
27-24 loss to the visiting Eagles.
"""

# Sample document 2: an excerpt about a green-building conference.
# NOTE(review): the fragments "Zero-rated buildings" and
# "on a job,on the job" look like they were appended to the article text
# so that the phrase queries below have near-miss and exact matches --
# confirm with the author.
doc2 = """
The fifth edition of West Coast Green, a conference focusing on "green" home 
innovations and products, rolled into San Francisco's Fort Mason last week 
intent, per usual, on making our living spaces more environmentally friendly 
- one used-tire house at a time.Zero-rated buildings 
To that end, there were presentations on topics such as water efficiency and 
the burgeoning future of Net Zero-rated buildings that consume no energy and 
produce no carbon emissions.on a job,on the job
"""
# Build the multi-document index {word: {doc_id: [positions]}} from the
# two sample documents.
inverted = {}
documents = {'doc1': doc1, 'doc2': doc2}
# .items() and single-argument print(...) calls behave identically on
# Python 2 and Python 3, unlike .iteritems() and the print statement.
for doc_id, text in documents.items():
    doc_index = inverted_index(text)
    inverted_index_add(inverted, doc_id, doc_index)

# Debugging aid: uncomment to dump the whole inverted index.
#for word, doc_locations in inverted.items():
    #print("%s %r" % (word, doc_locations))

# Single-word queries: documents containing the word.
print("*****search common words*****")
queries = ['Week', 'Niners', 'coast']
for query in queries:
    result_docs = search(inverted, query)
    print("Search for '%s': %r" % (query, result_docs))

# Phrase queries: documents containing the words at consecutive positions.
print("")
print("*****search phrases*****")
newQueries = ['Zero-rated buildings', 'on the job', 'West Coast']
for query in newQueries:
    result_docs = searchPhrase(inverted, query)
    print("Search for '%s': %r" % (query, result_docs))
1
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:108233次
    • 积分:4301
    • 等级:
    • 排名:第7009名
    • 原创:325篇
    • 转载:46篇
    • 译文:3篇
    • 评论:25条