# Import the required libraries and read in the data

# Import our libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import seaborn as sns
sns.set(style="ticks")

import check_file as ch

%matplotlib inline

# Read in our dataset
# NOTE(review): the load statement was absent from this export, which left
# `diabetes` undefined for every cell below. The file name used here is an
# assumption -- confirm the actual path before running.
diabetes = pd.read_csv('diabetes.csv')

# Take a look at the first few rows of the dataset
diabetes.head()

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72             35        0  33.6                     0.627   50        1
1            1       85             66             29        0  26.6                     0.351   31        0
2            8      183             64              0        0  23.3                     0.672   32        1
3            1       89             66             23       94  28.1                     0.167   21        0
4            0      137             40             35      168  43.1                     2.288   33        1

# Cells for work
# Per-column summary statistics: count, mean, std, min, quartiles and max
diabetes.describe()

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin         BMI  DiabetesPedigreeFunction         Age     Outcome
count   768.000000  768.000000     768.000000     768.000000  768.000000  768.000000                768.000000  768.000000  768.000000
mean      3.845052  120.894531      69.105469      20.536458   79.799479   31.992578                  0.471876   33.240885    0.348958
std       3.369578   31.972618      19.355807      15.952218  115.244002    7.884160                  0.331329   11.760232    0.476951
min       0.000000    0.000000       0.000000       0.000000    0.000000    0.000000                  0.078000   21.000000    0.000000
25%       1.000000   99.000000      62.000000       0.000000    0.000000   27.300000                  0.243750   24.000000    0.000000
50%       3.000000  117.000000      72.000000      23.000000   30.500000   32.000000                  0.372500   29.000000    0.000000
75%       6.000000  140.250000      80.000000      32.000000  127.250000   36.600000                  0.626250   41.000000    1.000000
max      17.000000  199.000000     122.000000      99.000000  846.000000   67.100000                  2.420000   81.000000    1.000000

# 1.sns.pairplot

http://seaborn.pydata.org/generated/seaborn.pairplot.html

# The diagonal panels show a histogram (distribution) of each attribute, while
# the off-diagonal panels plot each pair of different attributes against each other
sns.pairplot(diabetes)
# Save the pair-plot figure to disk
plt.savefig("./pairplot000.png")


# Useful sns.pairplot options, illustrated on the diabetes frame:
#
# hue: colour the points by the categories of one column
#     sns.pairplot(diabetes, hue="Outcome")
#
# kind: type of the off-diagonal plots, "scatter" or "reg"
# diag_kind: type of the diagonal plots, "hist" or "kde"
#     sns.pairplot(diabetes, kind="reg", diag_kind="kde")
#
# palette: colour palette used for the hue levels
#     sns.pairplot(diabetes, hue="Outcome", palette="husl")
#
# markers: marker style for the points, one marker per hue level
# (Outcome takes two values, so two markers are listed)
#     sns.pairplot(diabetes, hue="Outcome", markers=["+", "s"])
#
# vars: restrict the grid to the listed columns
#     sns.pairplot(diabetes, vars=["Pregnancies", "Glucose"])
#
# x_vars / y_vars: choose the columns for the grid's horizontal and vertical
# axes separately; note that both must be specified together
#     sns.pairplot(diabetes, x_vars=["Pregnancies", "Glucose"],
#                  y_vars=["BloodPressure", "SkinThickness"])

# Pair plot coloured by the Outcome column. The trailing semicolon keeps the
# notebook from echoing the returned grid object.
sns.pairplot(diabetes, hue="Outcome");
plt.savefig("./pairplot01.png")


# 2. Heatmap

# The heatmap is one of the most commonly used seaborn plots.
#
# Key idea: when you first get a dataset, a usual first step is to compute the
# correlation coefficients between the features. pandas computes them directly
# (DataFrame.corr()), and the resulting matrix can be handed to sns.heatmap.

# annot=True writes each coefficient into its cell; the trailing semicolon
# keeps the notebook from echoing the returned axes object.
sns.heatmap(diabetes.corr(), annot=True, cmap="YlGnBu");
plt.savefig("./heatmap.png")


# 3. hist(): histograms

# Draw one histogram per column of the DataFrame
diabetes.hist()
# Save the grid of histograms to disk
plt.savefig("./hist.png")


# Possible keys for the dictionary
a = '0.65'
b = '0'
c = 'Age'
d = '0.35'
e = 'Glucose'
f = '0.5'
g = "More than zero"

# Fill in the dictionary with the correct values here
# Maps each exploratory-analysis statement to the chosen answer above.
sol_one = {
    'The proportion of diabetes outcomes in the dataset': d,
    'The number of missing data points in the dataset': b,
    'A dataset with a symmetric distribution': e,
    'A dataset with a right-skewed distribution': c,
    'This variable has the strongest correlation with the outcome': e,
}

# Just to check your answer, don't change this
# NOTE(review): the checker call that belongs here is missing from this export;
# presumably a `check_file` helper that takes the dictionary -- confirm its name.

Awesome! These all look great!


# Predictors are the eight non-target columns; the label is the Outcome column
feature_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
X = diabetes[feature_columns]
y = diabetes['Outcome']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


### Use a randomized hyperparameter search

# build a classifier
# (seeded so the fitted forests are repeatable from run to run)
clf_rf = RandomForestClassifier(random_state=42)

# Set up the hyperparameter search
param_dist = {
    "max_depth": [3, None],
    "n_estimators": list(range(10, 200)),
    # from 1 up to the number of feature columns in the training data
    "max_features": list(range(1, X_train.shape[1] + 1)),
    "min_samples_split": list(range(2, 11)),
    "min_samples_leaf": list(range(1, 11)),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

# Run a randomized search over the hyperparameters
# (seeded so the same candidate settings are sampled on every run)
random_search = RandomizedSearchCV(
    clf_rf, param_distributions=param_dist, random_state=42
)

# Fit the model on the training data
random_search.fit(X_train, y_train)

# Make predictions on the test data
rf_preds = random_search.best_estimator_.predict(X_test)

# Report the metrics on the test data
ch.print_metrics(y_test, rf_preds, 'random forest')

Accuracy score for random forest : 0.7597402597402597
Precision score random forest : 0.65
Recall score random forest : 0.7090909090909091
F1 score random forest : 0.6782608695652174


# build a classifier for ada boost
# (seeded so the fitted ensemble is repeatable from run to run)
clf_ada = AdaBoostClassifier(random_state=42)

# Set up the hyperparameter search
# look at  setting up your search for n_estimators, learning_rate
param_dist = {"n_estimators": [10, 100, 200, 400],
              "learning_rate": [0.001, 0.005, .01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 10, 20]}

# Run a randomized search over the hyperparameters
# (seeded so the same candidate settings are sampled on every run)
ada_search = RandomizedSearchCV(
    clf_ada, param_distributions=param_dist, random_state=42
)

# Fit the model on the training data
ada_search.fit(X_train, y_train)

# Make predictions on the test data
ada_preds = ada_search.best_estimator_.predict(X_test)

# Return your metrics on test data
ch.print_metrics(y_test, ada_preds, 'adaboost')

Accuracy score for adaboost : 0.7792207792207793
Precision score adaboost : 0.7441860465116279
Recall score adaboost : 0.5818181818181818
F1 score adaboost : 0.6530612244897959

# Support vector machine: start from an estimator with default settings
clf_svc = SVC()

# Candidate values for the search: the regularisation strength C and the
# kernel type (degree is not searched here).
# Parameter reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
param_dist = dict(
    C=[0.1, 0.5, 1, 3, 5],
    kernel=['linear', 'rbf'],
)

# Randomized search over the candidates, fitted on the training split
svc_search = RandomizedSearchCV(estimator=clf_svc, param_distributions=param_dist)
svc_search.fit(X_train, y_train)

# Predict the test split with the best estimator found by the search
svc_preds = svc_search.predict(X_test)

# Report the metrics on the test data
ch.print_metrics(y_test, svc_preds, 'svc')

Accuracy score for svc : 0.7532467532467533
Precision score svc : 0.6545454545454545
Recall score svc : 0.6545454545454545
F1 score svc : 0.6545454545454545


# Candidate answers, one label per model fitted above
a = 'randomforest'
# NOTE(review): this label was absent, so `b` below resolved to the stale '0'
# from the first quiz cell. The spelling follows the sibling labels -- confirm
# it matches what the checker expects.
b = 'adaboost'
c = 'supportvector'

best_model = b  # put your best model here as a string or variable

# See if your best model was also mine.
# Notice these might not match depending on your search!
ch.check_best(best_model)

Nice!  It looks like your best model matches the best model I found as well!  It makes sense to use f1 score to determine best in this case given the imbalance of classes.  There might be justification for precision or recall being the best metric to use as well - precision showed to be best with adaboost again.  With recall, SVMs proved to be the best for our models.


# 4. Feature importances (feature_importances_)

# Print the (rows, columns) shape of the full DataFrame
print(diabetes.shape)

(768, 9)

# Show your work here - the plot below was helpful for me
# https://stackoverflow.com/questions/44101458/random-forest-feature-importance-chart-using-python

# Label the bars with the columns the model was trained on, so the labels line
# up one-to-one with the importances (the full frame also holds the target).
features = X_train.columns
importances = random_search.best_estimator_.feature_importances_
# Ascending order, so the most important feature ends up as the top bar
indices = np.argsort(importances)

plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance');

plt.savefig("./importances.png")


For the most part, these match what I would expect based on the correlations we saw earlier between each variable and Outcome. However, one interesting finding is that the number of pregnancies looked to be correlated with the Outcome in the exploratory findings. That is likely only because pregnancy is correlated with age, and age is the variable that is truly the better indicator of diabetes.

# Match each statement below to one of these candidate variable names,
# then run the cell to check the solution
a, b, c, d = 'Age', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction'
e, f, g, h = 'Insulin', 'Glucose', 'Pregnancy', 'SkinThickness'

# Ranking of the four variables most related to the outcome
sol_seven = {
    'The variable that is most related to the outcome of diabetes': f,
    'The second most related variable to the outcome of diabetes': c,
    'The third most related variable to the outcome of diabetes': a,
    'The fourth most related variable to the outcome of diabetes': d,
}

# Hand the answers to the checker from check_file
ch.check_q_seven(sol_seven)

That's right!  Some of these were expected, but some were a bit unexpected too!


In this case study, we looked at predicting diabetes for 768 patients. There was a reasonable amount of class imbalance with just under 35% of patients having diabetes. There were no missing data, and initial looks at the data showed it would be difficult to separate patients with diabetes from those that did not have diabetes.

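Three advanced modeling techniques were used to predict whether or not a patient has diabetes. The most successful of these techniques proved to be an AdaBoost Classification technique, which had the following metrics (taken from a separate run of the randomized search, so the precision, recall and F1 values differ slightly from the cell output shown earlier):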

Accuracy score for adaboost : 0.7792207792207793

Precision score adaboost : 0.7560975609756098

Recall score adaboost : 0.5636363636363636

F1 score adaboost : 0.6458333333333333

Based on the initial look at the data, it is unsurprising that Glucose, BMI, and Age were important in understanding if a patient has diabetes. These were consistent with the more sophisticated approaches. One interesting finding was that the number of pregnancies appeared to be correlated with the outcome when initially looking at the data; however, this was likely due to its strong correlation with age.

