# Learning exercise: reverse maximum matching (逆向最大匹配) Chinese word segmentation.
import xlrd import codecs import os #读取所有需要分词的文件路径 def eachFile(filepath): pathDir = os.listdir(filepath) paths_set = set() for i in pathDir: paths_set.add(os.path.join('/%s'%i)) return paths_set #读取文本内容 def readtxt(filepath): with open(filepath,'r',encoding='utf8') as f: sentences = f.readlines() f.close() return sentences #将分词的结果存储 def writer_result(filepath,sentence): with codecs.open(filepath,'a',encoding='utf8') as w: w.write(sentence) w.close() #读分词词典,词典中最长词长度 def get_seg_words(filepath): xl =xlrd.open_workbook(filepath) sheet = xl.sheet_by_index(0) words = sheet.col_values(1,1) max_index = 0 word_dir = set() for word in words: word_dir.add(word) if len(word)>max_index: max_index = len(word) return word_dir,max_index #读取停用词词典 def get_stop_words(filepath): xl = xlrd.open_workbook(filepath) sheet = xl.sheet_by_index(0) words = sheet.col_values(1, 1) stop_words = set() for word in words: stop_words.add(word) return stop_words paths_set = eachFile('分词文本') seg_words,max_index = get_seg_words(r'词表/words.xlsx') stop_words = get_stop_words(r'词表/stopwords.xlsx') for path in paths_set: print('begain : %s'%path) sentences = readtxt('分词文本'+path) for sentence in sentences: sentence = sentence.strip() start_index = 1 end_index = len(sentence) result_sentence='' while start_index>0: for start_index in range(max(end_index-max_index,0),end_index,1): #print(sentence[start_index:end_index]) if sentence[start_index:end_index] in stop_words: break elif sentence[start_index:end_index] in seg_words or end_index == start_index+1: str = sentence[start_index:end_index] result_sentence=str+'/'+result_sentence break end_index = start_index writer_result('result'+path,result_sentence) writer_result('result'+path,'\r\n')