def add_endings(text):
    """Split *text* into sentences and wrap each in sentence markers.

    Each sentence is surrounded by "<s> " and " </s> " (note the trailing
    space after </s>, which the downstream regex tokenizer relies on).

    Args:
        text: The raw article text.

    Returns:
        A single string: all marked sentences concatenated in order.
    """
    sentences = nltk.sent_tokenize(text)
    # "".join over a generator avoids the quadratic cost of += in a loop
    # and replaces the confusing empty '""""""' accumulator literal.
    return "".join("<s> " + sentence + " </s> " for sentence in sentences)
def preprocess(text):
    """Lowercase the article, add sentence markers, and tokenize it.

    Args:
        text: The raw article text.

    Returns:
        A list of token strings produced by splitting on punctuation and
        whitespace (may contain empty strings at the split boundaries).
    """
    marked = add_endings(text.lower())  # lowercase, then mark sentence boundaries
    # Split on runs of punctuation/whitespace; <, >, and / are deliberately
    # absent from the class so the <s> / </s> markers survive as tokens.
    splitter = re.compile(r"[-\[\]()\t\n.,;!?“”‘'\\`~\s]+")
    return splitter.split(marked)
def unigrams(tokens):
    """Build a unigram frequency model from a token sequence.

    Args:
        tokens: The normalized, tokenized corpus as a sequence of strings.

    Returns:
        A dict mapping each distinct token to the number of times it
        occurs in the corpus.
    """
    from collections import Counter  # stdlib frequency counting beats a hand-rolled loop
    # Counter is a dict subclass; convert back to a plain dict so the
    # return type matches the original contract exactly.
    return dict(Counter(tokens))
def bigrams(tokens):
    """Build a bigram frequency model from a token sequence.

    Pairs are formed from adjacent tokens; a pair is skipped when its
    first token is the sentence-end marker "</s>" or an empty string.

    Args:
        tokens: The normalized, tokenized corpus as a sequence of strings.

    Returns:
        A dict mapping each "word1 word2" bigram string to its count.
    """
    bigram = {}
    # zip(tokens, tokens[1:]) pairs each token with its successor and
    # naturally stops before the last token, fixing the IndexError the
    # original tokens[index + 1] raised when the sequence ended with a
    # normal word (it only survived because preprocess appends a "" token).
    for first, second in zip(tokens, tokens[1:]):
        if first != "</s>" and first != "":
            key = first + " " + second
            bigram[key] = bigram.get(key, 0) + 1
    return bigram
# NLP: Python implementation of unigram and bigram language models
# (blog post, first published 2022-07-05 10:47:04)