# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
# NGram类计算字母序列出现的频率
class NGram(object):
    """Frequency table of character n-grams found in a piece of text.

    The table maps every n-character sequence seen in the text to the number
    of times it occurs. The table can be treated as a vector whose scalar
    magnitude is stored in ``length``.

    Attributes:
        n: Length of each character sequence.
        table: Dict mapping an n-character string to its occurrence count.
        length: Euclidean magnitude of the count vector in ``table``.
    """

    def __init__(self, text, n=3):
        """Build the n-gram table for *text* and compute its magnitude.

        Args:
            text: Unicode text to analyse.
            n: Length of the character sequences to count (default 3).
        """
        self.length = None
        self.n = n
        self.table = {}
        self.parse_text(text)
        self.calculate_length()

    def parse_text(self, text):
        """Count every n-gram of *text* into ``self.table``.

        Runs of whitespace are collapsed to a single space and one trailing
        space is appended, so the last word also produces its closing n-grams.
        """
        # Start from a window of n spaces so the first letters of the text
        # yield left-padded n-grams.
        chars = ' ' * self.n
        for letter in " ".join(text.split()) + " ":
            # Slide the window: drop the oldest character, append the new one.
            chars = chars[1:] + letter
            self.table[chars] = self.table.get(chars, 0) + 1

    def calculate_length(self):
        """Treat the n-gram table as a vector and compute its magnitude.

        The magnitude is the square root of the sum of squared counts. It is
        stored in ``self.length`` and also returned.
        """
        # Exponent 0.5 takes the square root; an exponent of 0 would make
        # every table's length equal to 1.
        self.length = sum(x * x for x in self.table.values()) ** 0.5
        return self.length
# nlp(一)语种检测
# 最新推荐文章于 2022-07-08 15:09:29 发布