主要针对连续重复字符串 (repetition)
比如:
“I love my cat cat cat cat cat”
“I love you. I love you. I love you. I love you. ”
还有其他一些后处理。
def replace_regex(x, remove=None):
    """Clean up generated text, mainly by removing consecutive repetitions.

    The passes run in this order:

    1. A standalone lowercase ``i`` becomes ``I``.
    2. Whitespace-separated repeats of one character ("p p p", "* * *")
       are reduced to a single occurrence.
    3. Whitespace-separated repeats of a whole word ("cat cat cat") are
       reduced to a single occurrence.
    4. Whitespace-separated repeats of a longer phrase or sentence
       ("I love you. I love you.") are reduced to a single occurrence.
    5. Words of 20 or more word characters and all underscores are removed.
    6. Spacing around ``$``, apostrophes, decimal points and
       sentence punctuation is tightened.
    7. The first lowercase ASCII letter of the text and of each sentence is
       upper-cased.

    Every run of whitespace (including newlines) is collapsed to a single
    space along the way; leading/trailing whitespace is not stripped.

    Args:
        x: The text to clean.
        remove: Unused; kept so existing callers that pass it keep working.

    Returns:
        The cleaned text.
    """
    import re

    # Standalone lowercase "i" -> "I".
    x = re.sub(r"\bi\b", "I", x)

    # Repeated single characters separated by whitespace: keep the first.
    # For word characters the repeat must itself be a standalone character
    # (trailing \b); otherwise "a and" would be read as "a a" + "nd".
    x = re.sub(
        r"\b(\w)(?:\s+\1\b)+|(\W)(?:\s+\2)+",
        lambda m: m.group(1) or m.group(2),
        x,
    )
    x = re.sub(r"\s+", " ", x)

    # Repeated whole words separated by whitespace: keep the first.
    # The trailing \b stops "the theory" from being treated as "the the".
    x = re.sub(r"\b(\w+)(?:\s+\1\b)+", r"\1", x)
    x = re.sub(r"\s+", " ", x)

    # Repeated phrases/sentences. The group is greedy, so one pass roughly
    # halves a run of n copies; iterate until nothing changes. The phrase
    # must neither start nor end in the middle of a word, otherwise
    # "this is" would be read as "th" + "is is".
    repeated_phrase = r"(?<!\w)(\S.*\S)(?:\s+\1)+(?!\w)"
    while True:
        deduped = re.sub(repeated_phrase, r"\1", x)
        if deduped == x:
            break
        x = deduped

    # Drop overly long "words" (20+ word characters) and stray underscores,
    # then collapse the double spaces those deletions can leave behind.
    x = re.sub(r"\w{20,}", "", x)
    x = x.replace("_", "")
    x = re.sub(r"\s+", " ", x)

    # "$ 999" -> "$999". The replacement is a plain "$": an escaped "\$"
    # would be emitted literally, backslash included.
    x = re.sub(r"\$\s(?=[0-9])", "$", x)
    # "I ' m" / "he ’ s" -> "I'm" / "he's" (the curly quote becomes straight).
    x = re.sub(r"(?<=[a-zA-Z])\s['’]\s(?=[a-zA-Z])", "'", x)
    # "9 . 1" -> "9.1".
    x = re.sub(r"(?<=[0-9])\s\.\s(?=[0-9])", ".", x)
    # "end of sentence . New" -> "end of sentence. New"; also handles a
    # punctuation mark that is the last character of the text.
    x = re.sub(r"(?<=\w)\s([.?!,:])(?=\s|$)", r"\1", x)

    # Upper-case the first letter of the text and of each sentence.
    # NOTE: the whitespace after . ! ? is optional in this pattern, so a
    # lowercase letter directly following one of them is upper-cased too.
    x = re.sub(
        r"(?:^|[.!?]\s?\"?\s?)([a-z])",
        lambda m: m.group(0).upper(),
        x,
    )
    return x
附上检查正则表达式的在线平台:
https://regex101.com/r/1paXsy/1