# 需要導入模塊: import jieba [as 別名]
# 或者: from jieba import lcut_for_search [as 別名]
def make_inverted_index(filename,read_buff_size,output_file_record_size,web_record_numbers=100000):
    """Build an inverted index and its dictionary for a corpus file.

    The corpus is read record by record, each record's content is tokenized
    with ``jieba.lcut_for_search`` and the tokens are pushed into a series of
    intermediate SPIMI files inside a temporary ``<base>_buff`` directory.
    The intermediate files are then merged, a dictionary is built from the
    merged file, and both results are copied next to the input file as
    ``<base>_inverted_index.txt`` and ``<base>_index_Dictionary.txt``.
    The temporary directory is removed at the end.

    ``<base>`` is ``filename`` with its last four characters removed, so the
    input is expected to end in a four-character extension such as ``.txt``.

    :param filename: path of the corpus file (expected to end in ``.txt``).
    :param read_buff_size: buffer size forwarded unchanged to ``read_block``,
        ``merge_inverted_files.merge_file`` and
        ``Dictionary.establish_ditionary``.
    :param output_file_record_size: maximum number of records written to
        each intermediate inverted-index file; must be at least 1.
    :param web_record_numbers: total number of records; only used to compute
        the progress ratio that is printed before each intermediate file.
    :return: None
    :raises ValueError: if ``output_file_record_size`` is smaller than 1.
    """
    # A limit below 1 could never be satisfied by the loop below and would
    # spin forever creating empty intermediate files.
    if output_file_record_size < 1:
        raise ValueError("output_file_record_size must be at least 1")

    block_read = read_block(read_buff_size, filename)
    # NOTE(review): the "?" runs in this literal look like mis-encoded
    # full-width punctuation -- confirm against the original source file.
    # The literal is kept exactly as found because it is runtime data.
    punct = set(u"""/+%#:!),.:;?]}¢""????????????????
?????????????????????????????
??•·???--?’”([{£¥""??????????????????
?????????“"-—_…""")
    # Only single-character tokens can match this set, so multi-character
    # alphanumeric tokens are still indexed.
    Letters_and_numbers = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")

    base_name = filename[:-4]
    buff_dir = base_name + "_buff"  # temporary directory for intermediate files
    if not os.path.exists(buff_dir):
        os.mkdir(buff_dir)

    file_numbers = 1
    reached_end = False
    while not reached_end:
        progress = file_numbers * (output_file_record_size) * 1.0 / web_record_numbers
        print("process :cuting word +making inverted_index files---->>>> %s" % progress)
        spimi = SPIMI_Invert(buff_dir + "/" + str(file_numbers) + ".txt")
        count = 0
        # Check the per-file limit BEFORE popping: pop_token() consumes a
        # record, so popping first and then breaking on the limit would
        # silently drop one record at every file boundary.
        while count < output_file_record_size:
            doc_id, content = block_read.pop_token()
            if content == "":
                # An empty content string marks the end of the input.
                reached_end = True
                break
            spimi.push_id(doc_id)
            for token in jieba.lcut_for_search(content):
                if token not in punct and token not in Letters_and_numbers:
                    spimi.push_word(token)
            count += 1
        # Presumably an empty word tells SPIMI_Invert to finish the current
        # file -- TODO confirm against SPIMI_Invert.push_word.
        spimi.push_word("")
        file_numbers += 1
    print("process :cuting word +making inverted_index files---->>>>Finish")

    # Merge the intermediate files "1" .. str(file_numbers - 1) into one file.
    merged_filename = merge_inverted_files.merge_file(
        [str(i) for i in range(1, file_numbers)], read_buff_size, buff_dir + "/")
    print("process:mergeing inverted index files----->Finish")

    # Build the dictionary from the merged inverted index.
    merged_path = buff_dir + "/" + merged_filename + ".txt"
    dictionary_path = buff_dir + "/" + "Dictionary.txt"
    Dictionary.establish_ditionary(merged_path, read_buff_size, dictionary_path)

    # Publish the results next to the input file, then drop the temp directory.
    shutil.copy(merged_path, base_name + "_inverted_index.txt")
    shutil.copy(dictionary_path, base_name + "_index_Dictionary.txt")
    shutil.rmtree(buff_dir)