记录英文文本清洗的主要步骤:
- 缩写展开(如 can't → can not、what's → what is)
- 拼写校正
- 标点符号
- 符号替换
- 去除空格
def clean_text(text):
"""
Clean text
:param text: the string of text
:return: text string after cleaning
"""
# acronym
text = re.sub(r"can\'t", "can not", text)
text = re.sub(r"cannot", "can not ", text)
text = re.sub(r"what\'s", "what is", text)
text = re.sub(r"What\'s", "what is", text)
text = re.sub(r"\'ve ", " have ", text)
text = re.sub(r"n\'t", " not ", text)
text = re.sub(r"i\'m", "i am ", text)
text = re.sub(r"I\'m", "i am ", text)
text = re.sub(r"\'re", " are ", text)
text = re.sub(r"\'d", " would ", text)
text = re.sub(r"\'ll",