# 需要导入模块: import jieba [as 别名]
# 或者: from jieba import lcut_for_search [as 别名]
def make_inverted_index(filename, read_buff_size, output_file_record_size, web_record_numbers=100000):
    '''
    Build an on-disk inverted index for a document collection.

    Documents are read through ``read_block``, tokenized with
    ``jieba.lcut_for_search`` and written, ``output_file_record_size``
    documents at a time, to numbered intermediate files in a temporary
    ``<base>_buff`` directory.  The intermediate files are then merged, a
    dictionary is built from the merged file, and both results are copied
    next to the input before the temporary directory is removed.

    ``<base>`` is ``filename`` with its last four characters removed, so the
    input name is expected to end in a four-character extension such as
    ``.txt``.

    :param filename: path of the input file.
    :param read_buff_size: forwarded unchanged to ``read_block``,
        ``merge_inverted_files.merge_file`` and
        ``Dictionary.establish_ditionary`` (presumably a read-buffer size;
        confirm against those helpers).
    :param output_file_record_size: maximum number of documents indexed into
        one intermediate file; must be positive.
    :param web_record_numbers: used only as the denominator of the printed
        progress ratio (presumably the total number of documents).
    :return: None.  Outputs are ``<base>_inverted_index.txt`` and
        ``<base>_index_Dictionary.txt``.
    :raises ValueError: if ``output_file_record_size`` is not positive.
    '''
    # A non-positive run size could never consume a document, so the run
    # loop below would not terminate; reject it before touching the disk.
    if output_file_record_size <= 0:
        raise ValueError("output_file_record_size must be a positive integer")

    block_read = read_block(read_buff_size, filename)

    # Single-character tokens that are never indexed.
    # NOTE(review): the '?' runs look like non-ASCII punctuation that was lost
    # to an encoding error; the literal is preserved as found - confirm
    # against the original file.  Continuation lines must stay at column 0 so
    # that no space character sneaks into the set.
    punct = set(u'''/+%#:!),.:;?]}¢'"????????????????
?????????????????????????????
??•·???--?’”([{£¥'"??????????????????
?????????“‘-—_…''')
    Letters_and_numbers = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')

    base_name = filename[:-4]  # strips the (assumed four-character) extension
    buff_dir = base_name + '_buff'  # scratch directory for the intermediate files
    if not os.path.exists(buff_dir):
        os.mkdir(buff_dir)

    file_numbers = 1
    # One-record lookahead: the record that ends a full run is kept and becomes
    # the first record of the next run.  (Previously it was popped and then
    # discarded, losing one document at every run boundary.)
    doc_id, content = block_read.pop_token()
    while True:
        progress = file_numbers * (output_file_record_size) * 1.0 / web_record_numbers
        # Single-argument print(): same output on Python 2 and Python 3.
        print("process :cuting word +making inverted_index files---->>>> %s" % (progress,))
        spimi = SPIMI_Invert(buff_dir + '/' + str(file_numbers) + '.txt')
        count = 0
        # An empty content string marks the end of the input.
        while content != '' and count < output_file_record_size:
            spimi.push_id(doc_id)
            for word in jieba.lcut_for_search(content):
                if word not in punct and word not in Letters_and_numbers:
                    spimi.push_word(word)
            count += 1
            doc_id, content = block_read.pop_token()
        # Empty word presumably tells SPIMI_Invert to finish/flush this run -
        # confirm against SPIMI_Invert.push_word.
        spimi.push_word('')
        file_numbers += 1
        if content == '':
            break
    print("process :cuting word +making inverted_index files---->>>>Finish")

    # Merge the intermediate files 1 .. file_numbers-1 into a single file.
    run_names = [str(i) for i in range(1, file_numbers)]
    merged_filename = merge_inverted_files.merge_file(run_names, read_buff_size, buff_dir + '/')
    print("process:mergeing inverted index files----->Finish")

    # Build the dictionary file from the merged inverted index.
    merged_path = buff_dir + '/' + merged_filename + '.txt'
    dictionary_path = buff_dir + '/' + "Dictionary.txt"
    Dictionary.establish_ditionary(merged_path, read_buff_size, dictionary_path)

    # Publish the results next to the input, then drop the scratch directory.
    shutil.copy(merged_path, base_name + '_inverted_index.txt')
    shutil.copy(dictionary_path, base_name + '_index_Dictionary.txt')
    shutil.rmtree(buff_dir)