In [3]:
# Import libraries needed for this project
import numpy as np
import pandas as pd
from time import time
from IPython.display import display  # allows the use of display() for DataFrames

# Import supplementary visualization code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

# Load the census data
data = pd.read_csv("census.csv")

# Success - display the first record
display(data.head(n=1))
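Before counting anything, it can be worth a quick sanity check that pandas parsed the file as expected. A minimal sketch, assuming only the data frame loaded above:

# Quick sanity check: shape, inferred dtypes, and missing cells
print(data.shape)
print(data.dtypes)
print(data.isnull().sum().sum())  # total number of missing cells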
In [4]:
# TODO: Total number of records
n_records = data.shape[0]

# TODO: Number of records where individual's income is more than $50,000
n_greater_50k = data[data.income.str.contains('>50K')].shape[0]

# TODO: Number of records where individual's income is at most $50,000
n_at_most_50k = data[data.income.str.contains('<=50K')].shape[0]

# TODO: Percentage of individuals whose income is more than $50,000
greater_percent = (n_greater_50k / float(n_records)) * 100

# Print the results
print("Total number of records: {}".format(n_records))
print("Individuals making more than $50,000: {}".format(n_greater_50k))
print("Individuals making at most $50,000: {}".format(n_at_most_50k))
print("Percentage of individuals making more than $50,000: {:.2f}%".format(greater_percent))
In [5]:
# Split the data into features and target label
income_raw = data['income']
features_raw = data.drop('income', axis=1)

# Visualize skewed continuous features of original data
vs.distribution(data)
In [6]:
# Log-transform the skewed features
skewed = ['capital-gain', 'capital-loss']
features_raw[skewed] = data[skewed].apply(lambda x: np.log(x + 1))

# Visualize the new log distributions
vs.distribution(features_raw, transformed=True)
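The transform above is the standard log1p trick: log(x + 1) maps zeros to zero while compressing the long right tails of capital-gain and capital-loss. NumPy's log1p computes the same quantity and is the numerically safer spelling near zero; a small illustration:

# log(x + 1) leaves zeros at zero and shrinks large outliers
x = np.array([0., 1., 100., 10000.])
print(np.log(x + 1))  # the transform used above
print(np.log1p(x))    # equivalent, numerically safer near zero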
In [7]:
# Import sklearn.preprocessing.MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler()
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
# Scale from features_raw (not data), so the log transform above is preserved
features_raw[numerical] = scaler.fit_transform(features_raw[numerical])

# Show an example of a record with scaling applied
display(features_raw.head(n=1))
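MinMaxScaler rescales each column independently to [0, 1] via x' = (x - min) / (max - min). As a check, the same result can be reproduced by hand for one column; a sketch assuming the objects defined above ('age' is untouched by the log transform, so data and features_raw agree for it):

# Reproduce the scaler's output for 'age' manually: x' = (x - min) / (max - min)
age = data['age'].astype(float)
manual = (age - age.min()) / (age.max() - age.min())
print(manual.head(n=1))
print(features_raw['age'].head(n=1))  # should match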
In [8]:
# TODO: One-hot encode the 'features_raw' data using pandas.get_dummies()
features = pd.get_dummies(features_raw)

# TODO: Encode the 'income_raw' data to numerical values
income = income_raw.replace(['>50K', '<=50K'], [1, 0])

# Print the number of features after one-hot encoding
encoded = list(features.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

# Uncomment the following line to see the encoded feature names
# print(encoded)
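get_dummies leaves numeric columns alone and expands every categorical column into one 0/1 indicator column per observed category, which is why the feature count jumps well past the original column count. A toy example of the mechanics:

# One-hot encoding on a toy frame: 'color' becomes color_blue / color_red
toy = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': [1, 2, 3]})
print(pd.get_dummies(toy))  # columns: size, color_blue, color_red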
In [9]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, income, test_size=0.2, random_state=0)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
In [10]:
# TODO: Calculate accuracy
# The naive predictor labels every individual as making more than $50,000,
# so its true positives are exactly the n_greater_50k records
accuracy = n_greater_50k / float(n_records)

# TODO: Calculate F-score using the formula above for beta = 0.5
# With every record predicted positive there are no false negatives,
# so recall = 1; precision equals the accuracy computed above
precision = n_greater_50k / float(n_records)
recall = n_greater_50k / float(n_greater_50k)  # = 1.0
beta = 0.5
fscore = (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)

# Print the results
print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))
In [11]:
# TODO: Import two metrics from sklearn - fbeta_score and accuracy_score
from sklearn.metrics import fbeta_score, accuracy_score

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    results = {}

    # TODO: Fit the learner to the training data using slicing with 'sample_size'
    start = time()  # get start time
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time()  # get end time

    # TODO: Calculate the training time
    results['train_time'] = end - start

    # TODO: Get the predictions on the test set,
    # then get predictions on the first 300 training samples
    start = time()  # get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    end = time()  # get end time

    # TODO: Calculate the total prediction time
    results['pred_time'] = end - start

    # TODO: Compute accuracy on the first 300 training samples
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)

    # TODO: Compute accuracy on the test set
    results['acc_test'] = accuracy_score(y_test, predictions_test)

    # TODO: Compute F-score on the first 300 training samples
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.5)

    # TODO: Compute F-score on the test set
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results
In [12]:
# TODO: Import three supervised learning models from sklearn
from sklearn import tree, svm, ensemble

# TODO: Initialize the three models
clf_A = tree.DecisionTreeClassifier()
clf_B = svm.SVC()
clf_C = ensemble.AdaBoostClassifier()

# TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data
samples_1 = int(X_train.shape[0] * 0.01)
samples_10 = int(X_train.shape[0] * 0.1)
samples_100 = X_train.shape[0]
print([samples_1, samples_10, samples_100])

# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
            train_predict(clf, samples, X_train, y_train, X_test, y_test)

# Run metric visualization for the three supervised learning models chosen
vs.evaluate(results, accuracy, fscore)
In [21]:
# TODO: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer
from sklearn.ensemble import AdaBoostClassifier

# TODO: Initialize the classifier
clf = AdaBoostClassifier(random_state=0)

# TODO: Create the list of parameters you wish to tune
parameters = {'n_estimators': [50, 100, 200]}

# TODO: Make an fbeta_score scoring object
scorer = make_scorer(fbeta_score, beta=0.5)

# TODO: Perform grid search on the classifier using 'scorer' as the scoring method
kfold = KFold(n_splits=10)
grid_obj = GridSearchCV(clf, parameters, scoring=scorer, cv=kfold)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(X_train, y_train)

# Get the best estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized model
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta=0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))
In [23]:
# TODO: Import a supervised learning model that has 'feature_importances_'
from sklearn.ensemble import RandomForestClassifier

# TODO: Train the supervised model on the training set
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

# TODO: Extract the feature importances
importances = model.feature_importances_
importances_AdaBoost = best_clf.feature_importances_

# Plot
vs.feature_plot(importances, X_train, y_train)
vs.feature_plot(importances_AdaBoost, X_train, y_train)
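The plots show relative weights, but the top features can also be read off directly by pairing each importance with its column name via argsort; a short sketch using the fitted models above:

# Print the five most important features under the tuned AdaBoost model
top5 = np.argsort(importances_AdaBoost)[::-1][:5]
for idx in top5:
    print("{}: {:.4f}".format(X_train.columns[idx], importances_AdaBoost[idx]))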
In [25]:
# Import functionality for cloning a model
from sklearn.base import clone

# Reduce the feature space to the five most important features
X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances_AdaBoost)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(np.argsort(importances_AdaBoost)[::-1])[:5]]]

# Train the "best" model found from grid search earlier on the reduced data
clf = (clone(best_clf)).fit(X_train_reduced, y_train)

# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)

# Report scores from the final model using both versions of the data
print("Final Model trained on full data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))
print("\nFinal Model trained on reduced data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, reduced_predictions, beta=0.5)))