# Chinese word segmentation
def fen_ci(string, cut_all=False, append_tag=False, filter_set=None):
    """Segment *string* with jieba and break Chinese words into characters.

    Each token produced by ``jieba.cut`` is handled as follows:

    * tokens contained in ``filter_set`` are dropped;
    * tokens made up entirely of Chinese characters (see ``Chinese``) are
      split so that every character becomes its own list item;
    * any other token (digits, Latin words, punctuation, mixed tokens)
      is kept as a single item.

    Args:
        string: The text to segment.
        cut_all: Forwarded to ``jieba.cut`` as its ``cut_all`` argument.
        append_tag: When true, the result is wrapped in the literal marker
            items ``'start'`` and ``'end'``.
        filter_set: Container of tokens to discard (tested with ``in``).
            ``None`` (the default) filters nothing.  Passing a ``set`` makes
            the membership test cheaper than passing a ``list``.

    Returns:
        A list of strings: single Chinese characters and whole non-Chinese
        tokens, in their original order.
    """
    # Use None instead of a shared mutable default list.
    excluded = () if filter_set is None else filter_set

    seq = []
    if append_tag:
        seq.append('start')
    for ws in jieba.cut(string, cut_all=cut_all):
        if ws in excluded:
            continue
        if Chinese(ws):
            # Extending a list with a string adds one item per character.
            seq.extend(ws)
        else:
            seq.append(ws)
    if append_tag:
        seq.append('end')
    return seq


def Chinese(str):
    """Return True if *str* is non-empty and every character is Chinese.

    "Chinese" here means a code point in the range U+4E00..U+9FA5
    inclusive.  Characters are checked one by one: comparing the whole
    string against the range bounds would be a lexicographic comparison in
    which only the first character matters.

    Args:
        str: The token to test.  (The name shadows the builtin but is kept
            so that existing keyword callers continue to work.)

    Returns:
        ``True`` when all characters fall inside the range, ``False``
        otherwise, including for the empty string.
    """
    return bool(str) and all('\u4e00' <= ch <= '\u9fa5' for ch in str)
文本相似度--中文分词--拆成单个字
最新推荐文章于 2020-12-30 16:27:16 发布