import csv
import numpy as np
import matplotlib.pyplot as plt
import copy
from time import sleep
import random
import types
def loadDataset(filename):
with open(filename, 'r') as f:
lines = csv.reader(f)
data_set = list(lines)
if filename != 'titanic.csv':
for i in range(len(data_set)):
del(data_set[i][0])
# 整理数据
for i in range(len(data_set)):
del(data_set[i][0])
del(data_set[i][2])
data_set[i][4] += data_set[i][5]
del(data_set[i][5])
del(data_set[i][5])
del(data_set[i][6])
del(data_set[i][-1])
category = data_set[0]
del (data_set[0])
# 转换数据格式
for data in data_set:
data[0] = int(data[0])
data[1] = int(data[1])
if data[3] != '':
data[3] = float(data[3])
else:
data[3] = None
data[4] = float(data[4])
data[5] = float(data[5])
# 补全缺失值 转换记录方式 分类
for data in data_set:
if data[3] is None:
data[3] = 28
# male : 1, female : 0
if data[2] == 'male':
data[2] = 1
else:
data[2] = 0
# 经过测试,如果不将数据进行以下处理,分布会过于密集,处理后,数据的分布变得稀疏了
# age <25 为0, 25<=age<31为1,age>=31为2