import re
from string import punctuation as punct_eng

from zhon.hanzi import punctuation as punct_zh
def clean_str(str):
    """Strip whitespace and Chinese/English punctuation from a string.

    Three passes are applied in order:

    1. Remove ideographic spaces (U+3000), ASCII spaces, tabs and newlines.
    2. Remove every character contained in ``punct_zh``.
    3. Remove every character contained in ``punct_eng``.

    Args:
        str: Text to clean. The name shadows the builtin ``str``; it is kept
            unchanged so callers passing the argument by keyword still work.

    Returns:
        The input text with the characters listed above removed.
    """
    text = re.sub(r"[\u3000\u0020\t\n]+", "", str)
    # The punctuation strings must be escaped before being placed inside a
    # character class. Unescaped, the ``\]`` sequence in string.punctuation is
    # read as an escaped ``]``, so the backslash itself is never stripped, and
    # ``-`` is interpreted as a range operator rather than a literal.
    for chars in (punct_zh, punct_eng):
        text = re.sub("[%s]+" % re.escape(chars), "", text)
    return text
# Usage:
# Example call: clean a single piece of text. NOTE(review): ``line`` is not
# defined in this snippet; it is presumably supplied by the caller — confirm.
cline = clean_str(line)