# Tokenizer helpers: build character or token n-grams (trigrams by default).


def get_ngrams(query, n=3):
    """Return every contiguous n-character substring of ``str(query)``.

    Args:
        query: Any object; it is converted with ``str()`` before slicing.
        n: Length of each gram. Defaults to 3 (trigrams).

    Returns:
        A list of substrings in left-to-right order, duplicates included.
        The list is empty when the text is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    text = str(query)
    # The last valid start index is len(text) - n, so the exclusive upper
    # bound is len(text) - n + 1; stopping at len(text) - n would silently
    # drop the final gram.
    return [text[i:i + n] for i in range(len(text) - n + 1)]


# by zgd
def get_ngrams_zgd(input, n=3):  # noqa: A002 - name kept for existing callers
    """Count the n-grams of a sequence, keyed by their space-joined form.

    Args:
        input: A sliceable sequence of strings (a list of tokens, or a plain
            string, in which case each character is treated as one item).
        n: Number of consecutive items per gram. Defaults to 3.

    Returns:
        A dict mapping each gram (its items joined with a single space) to
        the number of times it occurs, in order of first appearance.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    output = {}
    for i in range(len(input) - n + 1):
        gram = " ".join(input[i:i + n])
        output[gram] = output.get(gram, 0) + 1
    return output
Python 处理文本使用 n-gram 方法
最新推荐文章于 2022-08-08 19:54:25 发布