import re


def compute_ngrams(word):
    """Slide windows of every length >= 2 over the segmented form of *word*.

    The word is first passed to ``segword`` (defined elsewhere, not visible
    in this snippet), which yields ``extended_word`` and ``tag_dict``.
    Every contiguous slice of ``extended_word`` whose length runs from 2 up
    to ``len(extended_word)`` is then visited.

    * If ``tag_dict`` is empty, the raw slice is appended to ``ngrams``.
    * Otherwise a copy of the slice (``new_word2``) has each ASCII letter
      replaced by ``tag_dict[letter] + ' '``.

    NOTE(review): the visible snippet ends inside the innermost loop.
    ``new_word2`` is never stored and ``ngrams`` is never returned here, so
    the tail of the function appears to be missing - confirm against the
    original source before relying on this function.
    """
    # BOW, EOW = ('<', '>')  # Used by FastText to attach to all words as prefix and suffix
    pattern = r'[a-zA-Z]+'
    # NOTE(review): the result of this findall is discarded, so the call has
    # no effect on what follows.
    re.findall(pattern, word)
    # presumably extended_word is a str and tag_dict maps single characters
    # to tag strings - verify against segword.
    extended_word,tag_dict = segword(word)
    # print(extended_word,tag_dict)
    min_n = 2
    max_n = len(extended_word)
    ngrams = []
    # max_n equals len(extended_word), so the min() below always evaluates to
    # len(extended_word).
    for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
        # i ranges over every start offset that leaves room for a full window.
        for i in range(0, len(extended_word) - ngram_length + 1):
            new_word = extended_word[i:i + ngram_length]
            new_word2 = new_word
            # NOTE(review): window lengths start at min_n == 2, so for an
            # ordinary sequence this guard looks unreachable.
            if len(new_word) == 1:
                continue
            if len(tag_dict) == 0:
                # No tags available: keep the raw window.
                ngrams.append(new_word)
            else:
                for c in new_word:
                    # bytes.isalpha() is true only for ASCII letters, so
                    # non-ASCII characters are left untouched.
                    if c.encode('utf-8').isalpha():
                        # str.replace substitutes every occurrence of c in
                        # new_word2, including letters inside tags inserted
                        # on earlier iterations. tag_dict[c] raises KeyError
                        # if c has no entry.
                        new_word2 = new_word2.replace(c,tag_dict[c]+' ')
NLP之滑动窗口函数
最新推荐文章于 2024-06-19 21:53:18 发布