1.数据集清洗
(注意:load_one_file() 中打开文件时,系统默认使用 utf-8 编码,但用 "utf-8" 或 "gbk" 读取数据集文件都会报解码错误,因此这里显式指定 encoding="cp852"。)
def load_one_file(filename):
    """Read a text file and return its content as a single line-less string.

    Every line is stripped of leading and trailing whitespace (which
    includes the ``\\n`` / ``\\r`` line terminators) and the stripped lines
    are concatenated with no separator, so text on either side of a line
    break runs together.

    Args:
        filename: Path of the file to read.

    Returns:
        The concatenation of all stripped lines; ``""`` for an empty file.
    """
    # The file is decoded as cp852: utf-8 and gbk reportedly raise decode
    # errors on the dataset files, so the encoding is set explicitly.
    with open(filename, encoding="cp852") as f:
        # join() builds the result in one pass instead of repeated `+=`.
        return "".join(line.strip() for line in f)
def load_files_from_dir(rootdir):
    """Load every regular file directly inside ``rootdir``.

    Sub-directories are skipped (the directory is not walked recursively).
    Each file is read with ``load_one_file``.

    Args:
        rootdir: Path of the directory to scan.

    Returns:
        A list with one string per file, in the order returned by
        ``os.listdir`` (which is arbitrary and not guaranteed to be sorted).
    """
    # Iterate the entries directly; the original shadowed the builtin `list`
    # and indexed into it with range(len(...)).
    paths = (os.path.join(rootdir, name) for name in os.listdir(rootdir))
    return [load_one_file(path) for path in paths if os.path.isfile(path)]
def load_all_files():
ham=[]
spam=[]
path="enron1/ham/"
print("Load %s" % path)
ham+=load_files_from_dir(path)
path="enron1/spam/"
print("Load %s"