# Reference (参考):
# https://yq.aliyun.com/articles/475854?utm_content=m_42632
#load doc into memory
def load_doc(filename):
    """Read a whole UTF-8 text file and return its contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises
    # (e.g. on a decoding error), which a manual open/close pair does not.
    with open(filename, mode='rt', encoding='utf-8') as handle:
        return handle.read()
#split a loaded document into sentences
def to_pairs(doc):
    """Split a loaded document into lines, and each line into tab-separated fields.

    Leading and trailing whitespace of the whole document is removed first,
    so a trailing newline does not produce an extra entry.

    Args:
        doc: The document text, one record per line with fields separated
            by tab characters.

    Returns:
        A list with one entry per line; each entry is the list of that
        line's tab-separated fields. An empty or whitespace-only document
        yields an empty list.
    """
    stripped = doc.strip()
    # ''.split('\n') is [''], which would otherwise surface as a single
    # bogus pair [''] for a document that contains no records at all.
    if not stripped:
        return []
    return [line.split('\t') for line in stripped.split('\n')]
#clean a list of lines
def clean_pairs(lines):
cleaned = list()
#prepare regex for char filtering
re_print = re.