NLP中的Unicode文本处理
Unicode字符标准化
将Unicode文本转化为ASCII。
import unicodedata
def unicode_to_ascii(s):
    """Remove accents/diacritics from ``s``.

    The string is first decomposed with Unicode normalization form NFD, which
    splits a precomposed character such as ``é`` into its base letter ``e``
    followed by a combining mark. Every character whose Unicode category is
    ``Mn`` (Mark, nonspacing) is then dropped, leaving only the base letters.

    Characters that have no canonical decomposition into an ASCII base
    (e.g. ``ß`` or CJK ideographs) are returned unchanged, so the result is
    not guaranteed to be pure ASCII.

    Args:
        s: The text to process.

    Returns:
        ``s`` in NFD form with all nonspacing combining marks removed.
    """
    decomposed = unicodedata.normalize('NFD', s)
    # Normalizing alone only splits the accents off; they must be filtered
    # out explicitly, otherwise the text still renders with its accents.
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
在标点与单词之间加入空格
def preprocess_sentence(s, punctuation=',.?!'):
    """Separate punctuation from words with single spaces.

    Each character listed in ``punctuation`` is surrounded by spaces, runs of
    consecutive spaces are collapsed into one, and leading/trailing
    whitespace is stripped.

    Args:
        s: The sentence to process.
        punctuation: The characters to treat as punctuation. Defaults to
            ``',.?!'``. An empty string disables the punctuation spacing.

    Returns:
        The processed sentence, e.g. ``'Hello, Bob!'`` -> ``'Hello , Bob !'``.
    """
    import re

    result = s
    if punctuation:
        # Escape the characters so that ones with a special meaning inside a
        # character class (']', '^', '-', '\\') are matched literally.
        pattern = r"([{}])".format(re.escape(punctuation))
        result = re.sub(pattern, r" \1 ", result)
    # Collapse runs of consecutive spaces into a single space.
    result = re.sub(r"[ ]+", r" ", result)
    # Strip whitespace from both ends of the sentence.
    return result.strip()
preprocess_sentence('Hello, Bob!')
# Result: 'Hello , Bob !'