例一
from string import punctuation
from string import digits
import re
def preprocess_English(text, rm_list):
    """Remove ASCII punctuation, digits and the given substrings from ``text``.

    Args:
        text: The string to clean.
        rm_list: Iterable of substrings to delete from ``text`` after the
            punctuation and digits have been removed.

    Returns:
        The cleaned string.
    """
    # re.escape is required here: string.punctuation contains characters that
    # are significant inside a regex character class. Unescaped, its "\]"
    # sequence is parsed as an escaped "]", so a literal backslash is never
    # part of the class and would be left in the text.
    pattern = '[{}]+'.format(re.escape(punctuation + digits))
    text = re.sub(pattern, '', text)
    for rm_item in rm_list:
        text = text.replace(rm_item, '')
    return text
# Substrings to remove from the text; '\n' and '\t' must be included.
rm_list = ['pg', '\n', '\t']
text_file = 'LifeofEdwinForrest.txt'

# Read the whole file as UTF-8 and lower-case it before cleaning.
with open(text_file, 'r', encoding='utf-8') as f:
    text = f.read().lower()

print(preprocess_English(text, rm_list))
例二
def preprocess_Chinese(text):
    """Return ``text`` with every character in ``zhon.hanzi.punctuation`` removed.

    Args:
        text: The string to clean.

    Returns:
        The input string without the matched punctuation characters.
    """
    # Function-scope import: zhon is a third-party package that is only
    # loaded when this function is called.
    from zhon.hanzi import punctuation as hanzi_punctuation

    pattern = r'[{}]+'.format(hanzi_punctuation)
    return re.sub(pattern, '', text)
def preprocess_English(text):
    """Return ``text`` with all ASCII punctuation characters removed.

    Args:
        text: The string to clean.

    Returns:
        The input string without any character from ``string.punctuation``.
    """
    from string import punctuation

    # re.escape is required here: unescaped, the "\]" sequence inside
    # string.punctuation is parsed as an escaped "]", so a literal backslash
    # is never part of the character class and would be left in the text.
    pattern = '[{}]+'.format(re.escape(punctuation))
    return re.sub(pattern, '', text)
参考:
[1] NLP:最全去掉文本中的中英文标点符号大法 [CSDN]