# Strip the given special characters out of the text, then lowercase the result.
def clean_text(text_string, special_characters):
    """Return ``text_string`` with every special character removed, lowercased.

    Args:
        text_string: The text to clean.
        special_characters: Iterable of substrings to delete from the text.

    Returns:
        The cleaned, lowercased string.

    Note:
        Each special character is deleted outright, not replaced by a space,
        so the text on either side of it is joined ("a,b" becomes "ab").
    """
    remaining = text_string
    for unwanted in special_characters:
        remaining = remaining.replace(unwanted, "")
    # Lowercasing happens last, so the removal above is case-sensitive.
    return remaining.lower()
# Tokenizer that splits on single spaces. ``clean`` is optional: when true,
# the special characters are stripped (and the text lowercased) first.
def tokenize(text_string, special_characters, clean=False):
    """Split ``text_string`` into tokens on the space character.

    Args:
        text_string: The text to split.
        special_characters: Substrings passed on to ``clean_text`` when
            ``clean`` is true; unused otherwise.
        clean: When true, run ``clean_text`` on the text before splitting.

    Returns:
        A list of tokens. The split is on a single space, so consecutive
        spaces produce empty-string tokens and other whitespace is kept
        inside the tokens.
    """
    source = clean_text(text_string, special_characters) if clean else text_string
    return source.split(" ")
# Module-level list, initialised empty. Nothing in the visible code fills it;
# presumably it is meant to hold the result of spell_check() -- TODO confirm.
final_misspelled_words = []
# Collect the misspelled words: tokens of the text file that are missing from
# the vocabulary file.
def spell_check(vocabulary_file, text_file, special_characters=None):
    """Return the words of ``text_file`` that are not in ``vocabulary_file``.

    Args:
        vocabulary_file: Path of the file holding the known words. It is
            tokenized on single spaces without any cleaning.
        text_file: Path of the file to check. It is cleaned (special
            characters removed, lowercased) before being tokenized.
        special_characters: Substrings removed from the text before
            tokenizing. Defaults to comma, period, apostrophe, semicolon
            and newline.

    Returns:
        A list of the non-empty text tokens that do not occur in the
        vocabulary, in order of appearance, duplicates included.
    """
    # Build the default per call instead of sharing one mutable list object
    # across every invocation.
    if special_characters is None:
        special_characters = [",", ".", "'", ";", "\n"]

    # Context managers guarantee both files are closed, even on error.
    with open(vocabulary_file) as vocabulary_handle:
        vocabulary = vocabulary_handle.read()
    with open(text_file) as text_handle:
        text = text_handle.read()

    # A set gives constant-time membership tests; it is built once, outside
    # the loop over the text tokens.
    known_words = set(tokenize(vocabulary, special_characters))
    tokenized_text = tokenize(text, special_characters, True)

    # Empty tokens come from consecutive spaces and are not words.
    return [
        word
        for word in tokenized_text
        if word != "" and word not in known_words
    ]
# Spell Checker
# (Web-page residue: "latest recommended article published 2022-03-08 13:47:41")