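# Chinese word segmentation with jieba (full mode): builds an inverted word index and supports general multi-word queries and phrase queries.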
# -*- coding: utf-8 -*-
import jieba
'''
Created on 2015-11-23
'''
def word_split(text):
    """
    Segment a text with jieba in full mode (cut_all=True).

    Returns a list of (index, word) tuples. ``index`` is the running
    ordinal of the word among the non-empty tokens, starting at 0 --
    it is a token counter, not a byte or character offset.
    """
    tokens = jieba.cut(text, cut_all=True)
    # Empty tokens are dropped before numbering, so indices stay contiguous.
    kept = (token for token in tokens if len(token) > 0)
    return list(enumerate(kept))
def inverted_index(text):
    """
    Build the inverted index of a single text document.

    The text is tokenized with word_split(); the result maps every word
    to the list of indices at which it occurs, in order of appearance:
    {word: [locations]}
    """
    postings = {}
    for position, token in word_split(text):
        if token in postings:
            postings[token].append(position)
        else:
            # First occurrence of this token: start its location list.
            postings[token] = [position]
    return postings
def inverted_index_add(inverted, doc_id, doc_index):
    """
    Merge one document's inverted index into the multi-document index.

    inverted  -- multi-document index, {word: {doc_id: [locations]}};
                 it is updated in place
    doc_id    -- identifier under which this document's locations are stored
    doc_index -- single-document index, {word: [locations]}

    Returns the same ``inverted`` object that was passed in. If a word
    already has an entry for doc_id, that entry is overwritten. The
    location lists are stored by reference, not copied.
    """
    # items() instead of the Python-2-only iteritems(), so the function
    # runs unchanged on both Python 2 and Python 3.
    for word, locations in doc_index.items():
        inverted.setdefault(word, {})[doc_id] = locations
    return inverted
def search_a_word(inverted, word):
    """
    Look up a single word in the inverted index.

    inverted -- index mapping each word to its stored entry
                (inverted_index_add builds {word: {doc_id: [locations]}})
    word     -- the query word; a byte string is decoded as UTF-8,
                already-decoded text is used as-is

    Returns the entry stored for the word, or None if the word is not
    in the index.
    """
    # Decode only raw bytes. Calling .decode() on already-decoded text
    # fails: Python 2 first tries an implicit ASCII encode (breaking on
    # Chinese text) and Python 3 str has no decode() at all.
    if isinstance(word, bytes):
        word = word.decode('utf-8')
    return inverted.get(word)
def search_words(inverted, wordList):
"""
search more than one word
"""
wordDic = []
docRight = []
for word in wordList:
if isinstance(word, str):
word = word.decode('utf-8')
if word not in inverted:
return None
else:
element = inverted[wor