目录
总结:train_test_split、KFold、StratifiedKFold 的作用都是将数据拆分。
影响薪水的因素有很多,比如教育背景,年龄,种族,工作单位等等,可以用KNN进行分类。
导包
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# cv given as an int (e.g. 6) means the data is split into 6 folds
from sklearn.model_selection import cross_val_score,GridSearchCV
# KFold、StratifiedKFold将数据分成多少份
from sklearn.model_selection import KFold,StratifiedKFold
数据和目标值
# Toy dataset: 8 samples, each with 2 random integer features drawn from [0, 10).
data = np.random.randint(low=0, high=10, size=(8, 2))
# One binary label per sample.
target = np.array([0, 0, 1, 0, 1, 1, 1, 0])
# `display` is supplied by IPython/Jupyter; it is not a plain-Python builtin.
display(data, target)
# The split result is not stored; it is only visible as notebook cell output.
train_test_split(data, target)
数据是随机的
kFold = KFold(n_splits=4)
# split() yields index arrays rather than the samples themselves; the indices
# are then used to pick the matching entries out of `target`.
for train_idx, test_idx in kFold.split(data, target):
    train_labels = target[train_idx]
    test_labels = target[test_idx]
    print(train_labels, test_labels)
数据是按比例的
# Split into 4 folds; each fold keeps the same class proportions as the
# original label array.
sKFold = StratifiedKFold(n_splits=4)
for train_idx, test_idx in sKFold.split(data, target):
    print(target[train_idx], target[test_idx])
总结:train_test_split、KFold、StratifiedKFold 的作用都是将数据拆分。
# Load the salary table from a text file in the current working directory.
# NOTE: this rebinds `data`, which previously held the random demo array.
data = pd.read_csv('./salary.txt')
# Bare expression: the preview is only shown in an interactive/notebook session.
data.head()
属性
# Column labels of the loaded table (bare expression; shown only in a notebook).
data.columns
删除无用数据
# Remove, in place, the columns that the rest of the analysis does not use.
unused_columns = ['final_weight', 'education', 'capital_gain', 'capital_loss']
data.drop(columns=unused_columns, inplace=True)
数据结构
# (rows, columns) of the table after the unused columns were dropped.
data.shape
# Preview of the first rows (bare expression; shown only in a notebook).
data.head()
算法进行
# Features are every column except the last one. The .copy() gives X its own
# data, so the column assignments made later do not write through a slice of
# `data` (which would trigger pandas' SettingWithCopyWarning).
# NOTE(review): presumably 'salary' is the last column -- confirm against the file.
X = data.iloc[:, 0:-1].copy()
y = data['salary']
knn = KNeighborsClassifier()
# The feature columns still hold strings at this point (they are encoded as
# integers further down), so this first fit is expected to fail. Report the
# error instead of aborting the rest of the script.
try:
    knn.fit(X, y)
except ValueError as err:
    print('KNN cannot be fitted on non-numeric features yet:', err)
将数据中的 str 转换为 int 或 float,使算法可以计算(可用 map 方法、apply 方法、transform 方法)
# Distinct workclass values; a value's position in this array is its integer code.
u = X['workclass'].unique()
# Demo lookup: np.argwhere returns a 2-D index array, hence the [0, 0].
np.argwhere(u == 'Local-gov')[0, 0]


def convert(x):
    """Return the position of *x* within the array of unique values ``u``."""
    matches = np.argwhere(u == x)
    return matches[0, 0]


X['workclass'] = X['workclass'].map(convert)
X.head()
将剩余属性转化为数值
# Encode the remaining string-valued columns as integer codes.
cols = ['marital_status', 'occupation', 'relationship', 'race', 'sex',
        'native_country']
for col in cols:
    # Build the value -> code table once per column (code = position of the
    # value in unique()), instead of redefining a helper on every iteration
    # and scanning the array of unique values once per row.
    codes = {value: code for code, value in enumerate(X[col].unique())}
    X[col] = X[col].map(codes)
X.head()
算法计算
# Manual 10-fold cross-validation of a default KNN classifier.
n_splits = 10
kFold = KFold(n_splits=n_splits)  # split the data into 10 folds
knn = KNeighborsClassifier()
scores = []
for train, test in kFold.split(X, y):
    # split() yields positional row indices, so rows are selected with .iloc;
    # label-based .loc / y[...] is only correct while the index happens to be
    # the default 0..n-1 range.
    knn.fit(X.iloc[train], y.iloc[train])
    scores.append(knn.score(X.iloc[test], y.iloc[test]))
# Average over however many folds were actually run, rather than a
# hard-coded divisor that must be kept in sync with n_splits.
accuracy = sum(scores) / len(scores)
print(accuracy)  # mean accuracy across the folds