美国工资水平预测:
# Load the adults table from a CSV file (path is relative to the working directory).
adults = pd.read_csv('../data/adults.txt')
adults.head()  # preview of the first rows; the result is not stored

# Feature columns handed to the model. `.copy()` gives an independent frame,
# so later in-place edits to `data` do not touch `adults`.
feature_names = [
    'age',
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'race',
    'sex',
    'hours_per_week',
    'native_country',
]
data = adults.loc[:, feature_names].copy()

# Column the classifier is trained to predict.
target = adults['salary']
# Columns of `data` that are replaced by integer codes below
# (presumably the non-numeric ones -- confirm against the input file).
cols = ['workclass', 'education', 'marital_status', 'occupation', 'race', 'sex', 'native_country']
for col in cols:
    # pd.factorize numbers the distinct values in order of first appearance,
    # i.e. each value gets its position in data[col].unique(). It does this
    # in a single pass instead of one array scan per row, and a missing
    # value is coded as -1 rather than raising IndexError.
    data[col] = pd.factorize(data[col])[0]
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=1,
)

# Seed NumPy's global generator and draw one value from it
# (the drawn value is not used afterwards).
np.random.seed(2)
np.random.rand(1)

# Fit a k-nearest-neighbours classifier with its default settings;
# fit() returns the estimator itself, so it can be bound directly.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.score(X_train, y_train)  # accuracy on the training split

# Predict the held-out rows, then score the classifier on them.
y_ = knn.predict(X_test)
knn.score(X_test, y_test)

# Cross-tabulate true labels (rows) against predictions (columns),
# including row/column totals.
pd.crosstab(
    index=y_test,
    columns=y_,
    rownames=['真实值'],
    colnames=['预测值'],
    margins=True,
)
上述结果精度不够:KNN 基于距离进行分类,而各特征的取值范围差异较大,数据没有做归一化,取值范围大的特征会主导距离计算
改进:
定义归一化(min-max 缩放)函数:
def normalized(x):
    """Min-max rescale *x* so its minimum maps to 0 and its maximum to 1.

    Args:
        x: An object exposing ``.min()`` / ``.max()`` and element-wise
            arithmetic, e.g. a pandas Series or a NumPy array.

    Returns:
        ``(x - min) / (max - min)``, with the same shape as *x*. When *x*
        is constant (``max == min``) the result is all zeros instead of
        the NaN values a 0/0 division would produce.
    """
    low = x.min()
    span = x.max() - low
    try:
        # span is a scalar for 1-D input; for inputs where it is not
        # (the comparison has no single truth value) keep the plain formula.
        is_constant = bool(span == 0)
    except (TypeError, ValueError):
        is_constant = False
    if is_constant:
        # Multiplying by 0.0 yields float zeros, matching the float output
        # of the division branch.
        return (x - low) * 0.0
    return (x - low) / span
# Rescale every column of `data` in place with `normalized`.
# `normalized` needs the whole column to find its min and max, so it is
# called on the Series directly. Series.transform also accepts callables
# meant for element-wise use, which is not what this function is written for.
for col in data.columns:
    data[col] = normalized(data[col])
再进行预测:
# Split the features again with the same test fraction and random_state
# as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=1,
)

# Fit a fresh default k-nearest-neighbours classifier; fit() returns the
# estimator itself, so it can be bound directly.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.score(X_train, y_train)  # accuracy on the training split

# Predict the held-out rows, then score the classifier on them.
y_ = knn.predict(X_test)
knn.score(X_test, y_test)