获取数据:从 ./salary.txt 读取数据
# Load the salary data set and split it into features (X) and target (y).
data = pd.read_csv('./salary.txt')
data.head()
data.columns  # column names of the raw table

# Drop the columns that are not wanted for this exercise.
data = data.drop(
    columns=['final_weight', 'education', 'capital_gain', 'capital_loss'],
)
data.shape
data.head()

# Features: every column except the last one.
# NOTE(review): this assumes 'salary' is the last column -- confirm against the file.
# The explicit copy makes X an independent frame, so the column assignments
# done later on X neither touch `data` nor trigger SettingWithCopyWarning.
X = data.iloc[:, 0:-1].copy()
y = data['salary']  # target: the salary column on its own
# Convert the string values in the data to integers so they can be used in
# numeric computations. Candidate tools: map, apply, transform.
u = X['workclass'].unique()  # every distinct workclass value
u
np.argwhere(u == 'Local-gov')[0, 0]  # example: position of one value inside u

# Build the value -> position table once, instead of rescanning `u` with
# np.argwhere for every single row.
workclass_codes = {value: code for code, value in enumerate(u)}


def convert(x):
    """Return the position of ``x`` within the unique workclass values."""
    return workclass_codes[x]


X['workclass'] = X['workclass'].map(convert)
X.head()
# Encode the remaining string-valued columns as integers, one column at a time.
cols = ['marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
for col in cols:
    u = X[col].unique()  # distinct values of the current column
    # One lookup table per column, built once rather than searching `u`
    # with np.argwhere for every row.
    codes = {value: code for code, value in enumerate(u)}

    def convert(x):
        """Return the position of ``x`` within the current column's unique values."""
        return codes[x]

    # convert is applied immediately, so it always sees this column's table.
    X[col] = X[col].map(convert)
X.head()
# First look at 10-fold splitting: show the size of each train/test split
# and fit the classifier on the training part of the fold.
knn = KNeighborsClassifier()
kFold = KFold(10)
for train, test in kFold.split(X, y):
    print(train.shape, test.shape)
    # fit() requires the training samples and labels; calling it with no
    # arguments raises TypeError. `train` holds positional row indices,
    # hence .iloc.
    # NOTE(review): the original indentation is lost; fitting once per fold
    # is assumed here -- confirm.
    knn.fit(X.iloc[train], y.iloc[train])
准确率较低,可能的影响因素较多(例如特征未做归一化、类别特征被编码成了任意整数)。
# Estimate KNN accuracy as the mean score over K folds.
n_splits = 10  # number of folds; also the divisor used for the mean
knn = KNeighborsClassifier()
kFold = KFold(n_splits)
accuracy = 0
for train, test in kFold.split(X, y):
    # KFold.split yields positional indices, so select rows with .iloc;
    # label-based .loc / [] only matches when the index is 0..n-1.
    knn.fit(X.iloc[train], y.iloc[train])
    acc = knn.score(X.iloc[test], y.iloc[test])
    accuracy += acc / n_splits  # accumulate this fold's share of the mean
# Report the mean accuracy once all folds have been evaluated.
print(accuracy)
作业:
# 从 sklearn 的 preprocessing(数据预处理)模块中找一找,有没有其他方法可以将 str 类型转换成 int、float 类型