import pandas as pd

# Load the Titanic passenger data set from the working directory
# and peek at the first rows.
Titanic = pd.read_csv('Titanic.csv')
Titanic.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
# Discard identifier-like columns that carry no predictive signal.
Titanic.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)
# Count the missing values in each remaining column.
Titanic.isnull().sum(axis=0)
Survived 0
Pclass 0
Sex 0
Age 177
SibSp 0
Parch 0
Fare 0
Embarked 2
dtype: int64
# Impute missing Age values with the mean Age of the passenger's sex group.
# A groupby().transform keeps the original row order intact; the previous
# split / fillna / concat loop reassembled the frame one sex at a time and
# therefore reordered the rows, which silently changes any later
# position-dependent step (e.g. the random train/test split).
Titanic['Age'] = Titanic['Age'].fillna(
    Titanic.groupby('Sex')['Age'].transform('mean'))
# Fill the two missing Embarked values with the most frequent port.
Titanic.fillna(value={'Embarked': Titanic.Embarked.mode()[0]}, inplace=True)
Titanic.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked |
---|
0 | 0 | 3 | male | 22.000000 | 1 | 0 | 7.2500 | S |
---|
4 | 0 | 3 | male | 35.000000 | 0 | 0 | 8.0500 | S |
---|
5 | 0 | 3 | male | 30.726645 | 0 | 0 | 8.4583 | Q |
---|
6 | 0 | 1 | male | 54.000000 | 0 | 0 | 51.8625 | S |
---|
7 | 0 | 3 | male | 2.000000 | 3 | 1 | 21.0750 | S |
---|
# Treat passenger class as categorical so get_dummies encodes it as well.
Titanic['Pclass'] = Titanic['Pclass'].astype('category')

# One-hot encode the categorical predictors, append the indicator columns,
# then drop the original categorical columns.
categorical_cols = ['Sex', 'Embarked', 'Pclass']
dummy = pd.get_dummies(Titanic[categorical_cols])
Titanic = pd.concat([Titanic, dummy], axis=1)
Titanic.drop(columns=categorical_cols, inplace=True)
Titanic.head()
| Survived | Age | SibSp | Parch | Fare | Sex_female | Sex_male | Embarked_C | Embarked_Q | Embarked_S | Pclass_1 | Pclass_2 | Pclass_3 |
---|
0 | 0 | 22.000000 | 1 | 0 | 7.2500 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
---|
4 | 0 | 35.000000 | 0 | 0 | 8.0500 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
---|
5 | 0 | 30.726645 | 0 | 0 | 8.4583 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
---|
6 | 0 | 54.000000 | 0 | 0 | 51.8625 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
---|
7 | 0 | 2.000000 | 3 | 1 | 21.0750 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
---|
from sklearn import model_selection

# Every column except the first (Survived) serves as a predictor.
predictors = Titanic.columns[1:]
# Hold out 25% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    Titanic[predictors], Titanic['Survived'],
    test_size=0.25, random_state=1234)
from sklearn.model_selection import GridSearchCV
from sklearn import tree

# Candidate values for the tree's pruning hyperparameters.
max_depth = [2, 3, 4, 5, 6]
min_samples_split = [2, 4, 6, 8]
min_samples_leaf = [2, 4, 8, 10, 12]
parameters = {
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}
# 10-fold cross-validated grid search for the best combination.
grid_dtcateg = GridSearchCV(estimator=tree.DecisionTreeClassifier(),
                            param_grid=parameters, cv=10)
grid_dtcateg.fit(X_train, y_train)
grid_dtcateg.best_params_
{'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2}
from sklearn import metrics

# Refit a single tree using the parameters selected by the grid search.
CART_Class = tree.DecisionTreeClassifier(
    max_depth=3, min_samples_split=2, min_samples_leaf=4)
decision_tree = CART_Class.fit(X_train, y_train)  # fit() returns the estimator
pred = CART_Class.predict(X_test)
print('模型在测试集的预测准确率:\n', metrics.accuracy_score(y_test, pred))
模型在测试集的预测准确率:
0.8295964125560538
import matplotlib.pyplot as plt

# Positive-class probabilities drive the ROC analysis.
y_score = CART_Class.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)

# Shade the area under the curve, overlay the curve itself and the
# 45-degree reference line of a random classifier.
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
plt.plot(fpr, tpr, color='black', lw=1)
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.text(0.5, 0.3, 'ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()
<Figure size 640x480 with 1 Axes>
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus
# sklearn.externals.six was removed in scikit-learn 0.23; the
# standard-library io.StringIO is the drop-in replacement.
from io import StringIO

# Render the fitted decision tree as a PNG.
# NOTE: pydotplus shells out to the GraphViz executables ("dot"); they must
# be installed and on PATH, otherwise create_png() raises
# InvocationException ("GraphViz's executables not found").
dot_data = StringIO()
export_graphviz(
    decision_tree,                       # fitted DecisionTreeClassifier
    out_file=dot_data,
    feature_names=predictors,            # predictor column labels
    class_names=['Unsurvived', 'Survived'],
    rounded=True,
    special_characters=True
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
---------------------------------------------------------------------------
InvocationException Traceback (most recent call last)
<ipython-input-10-35271d1803f9> in <module>()
20 # 决策树展现
21 graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
---> 22 Image(graph.create_png())
~\Anaconda3\lib\site-packages\pydotplus\graphviz.py in <lambda>(f, prog)
1795 self.__setattr__(
1796 'create_' + frmt,
-> 1797 lambda f=frmt, prog=self.prog: self.create(format=f, prog=prog)
1798 )
1799 f = self.__dict__['create_' + frmt]
~\Anaconda3\lib\site-packages\pydotplus\graphviz.py in create(self, prog, format)
1958 if self.progs is None:
1959 raise InvocationException(
-> 1960 'GraphViz\'s executables not found')
1961
1962 if prog not in self.progs:
InvocationException: GraphViz's executables not found
from sklearn import ensemble

# Random forest classifier with 200 trees; fixed seed for reproducibility.
RF_class = ensemble.RandomForestClassifier(n_estimators=200, random_state=1234)
RF_class.fit(X_train, y_train)
RFclass_pred = RF_class.predict(X_test)
print('模型在测试集的预测准确率:\n', metrics.accuracy_score(y_test, RFclass_pred))

# ROC curve on the held-out test set.
y_score = RF_class.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
plt.plot(fpr, tpr, color='black', lw=1)
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.text(0.5, 0.3, 'ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()

# Feature importances, smallest to largest, as a horizontal bar chart.
importance = RF_class.feature_importances_
Impt_Series = pd.Series(importance, index=X_train.columns)
# Pass the plot kind as a keyword: positional plot('barh') relies on a
# deprecated pandas calling convention that newer versions reject.
Impt_Series.sort_values(ascending=True).plot(kind='barh')
plt.show()
# Kidney-function data for the regression example.
NHANES = pd.read_excel(r'C:\Users\Administrator\Desktop\NHANES.xlsx')
NHANES.head()
print(NHANES.shape)

# Every column but the last (CKD_epi_eGFR, the target) is a predictor.
predictors = NHANES.columns[:-1]
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    NHANES[predictors], NHANES['CKD_epi_eGFR'],
    test_size=0.25, random_state=1234)
# Hyperparameter grid for the regression tree.
max_depth = [18, 19, 20, 21, 22]
min_samples_split = [2, 4, 6, 8]
min_samples_leaf = [2, 4, 8]
parameters = {
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}
# 10-fold cross-validated grid search for the best combination.
grid_dtreg = GridSearchCV(estimator=tree.DecisionTreeRegressor(),
                          param_grid=parameters, cv=10)
grid_dtreg.fit(X_train, y_train)
grid_dtreg.best_params_

# Refit with the selected parameters and score MSE on the test set.
CART_Reg = tree.DecisionTreeRegressor(
    max_depth=20, min_samples_split=4, min_samples_leaf=2)
CART_Reg.fit(X_train, y_train)
pred = CART_Reg.predict(X_test)
metrics.mean_squared_error(y_test, pred)
# Random forest regressor with 200 trees; fixed seed for reproducibility.
RF = ensemble.RandomForestRegressor(n_estimators=200, random_state=1234)
RF.fit(X_train, y_train)
RF_pred = RF.predict(X_test)
metrics.mean_squared_error(y_test, RF_pred)

# Feature importances as a horizontal bar chart.
importance = pd.Series(RF.feature_importances_, index=X_train.columns)
# Pass the plot kind as a keyword: positional plot('barh') relies on a
# deprecated pandas calling convention that newer versions reject.
importance.sort_values().plot(kind='barh')
plt.show()