import pandas as pd
import numpy as np
def draw_missing_data_table(df):
    """Summarise how many values are missing in each column of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of *df*, ordered by missing count (descending),
        with two columns:

        * ``'Total'``   - number of null entries in that column.
        * ``'Percent'`` - null entries divided by the number of rows, i.e. a
          fraction between 0 and 1 (not multiplied by 100).
    """
    total = df.isnull().sum().sort_values(ascending=False)
    n_rows = len(df)
    # Divide by the total row count. Dividing by df.count() (the number of
    # NON-null entries) would yield missing/non-missing, which exceeds 1 for
    # mostly-empty columns and is infinite for all-null columns.
    # A frame without rows has nothing missing, so report 0.0 instead of 0/0.
    percent = total / n_rows if n_rows else total.astype(float)
    # Deriving `percent` from the already-sorted `total` keeps both series on
    # the same index order, so the concat is a plain side-by-side join.
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data
def plot_learning_curve(estimator,title,X,y,ylim=None,cv=None, n_jobs=1,train_sizes=np.linspace(.1,1.0,5)):
    """Draw training and validation score curves against training-set size.

    Relies on the names ``plt`` and ``learning_curve`` being available in the
    enclosing module (presumably ``matplotlib.pyplot`` and scikit-learn's
    ``learning_curve`` - confirm against the file's imports).

    Parameters
    ----------
    estimator, X, y, cv, n_jobs, train_sizes
        Forwarded unchanged to ``learning_curve``.
    title
        Passed to ``plt.title``.
    ylim
        Optional sequence unpacked into ``plt.ylim``; skipped when ``None``.

    Returns
    -------
    The ``plt`` object, so the caller can show or save the figure.
    """
    # Figure scaffolding is set up before the scores are computed.
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, held_out_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )

    # Per training size: mean and standard deviation taken along axis 1 of
    # each score array, paired with the colour and legend label of its curve.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training Score"),
            (held_out_scores, "g", "Validation Score"),
        )
    ]

    plt.grid()
    # Shaded +/- one standard deviation bands first, then the mean lines.
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)
    for centre, _, colour, label in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=label)
    plt.legend(loc="best")
    return plt
def plot_validation_curve(estimator, title, X, y,param_name, param_range, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1,1.0,5)):
    """Plot training and validation scores across values of one hyper-parameter.

    Relies on the names ``plt`` and ``validation_curve`` being available in
    the enclosing module (presumably ``matplotlib.pyplot`` and scikit-learn's
    ``validation_curve`` - confirm against the file's imports). Draws on the
    current figure and returns nothing.

    Parameters
    ----------
    estimator, X, y
        Forwarded unchanged to ``validation_curve``.
    title
        Passed to ``plt.title``.
    param_name, param_range
        Hyper-parameter to vary and the values to try; ``param_range`` is also
        used as the x coordinates of the plot (drawn on a log scale).
    ylim
        Optional y-axis limits handed to ``plt.ylim``; skipped when ``None``.
    cv, n_jobs
        Forwarded to ``validation_curve`` by keyword.
    train_sizes
        Unused; retained only so existing call sites keep working.
    """
    # Everything after (estimator, X, y) is passed by keyword: positionally,
    # `cv` would occupy the 6th slot, which is not `cv` in every version of
    # the scorer's signature (it is `groups` in scikit-learn's
    # model_selection API), and newer releases reject positional
    # param_name/param_range altogether.
    train_scores, test_scores = validation_curve(
        estimator, X, y,
        param_name=param_name, param_range=param_range,
        cv=cv, n_jobs=n_jobs,
    )
    # Mean and spread along axis 1 of each score array, per parameter value.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.plot(param_range, train_mean, color='r', marker='o', markersize=5, label='Training Score')
    plt.fill_between(param_range, train_mean + train_std, train_mean - train_std, alpha=0.15, color='r')
    plt.plot(param_range, test_mean, color='g', linestyle='--', marker='s', markersize=5, label='ValidationScore')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, alpha=0.15, color='g')
    plt.grid()
    plt.xscale('log')
    plt.legend(loc='best')
    # `title` was accepted but never shown; apply it as plot_learning_curve does.
    plt.title(title)
    plt.xlabel('Parameter')
    plt.ylabel('Score')
    # Only touch the axis limits when the caller actually supplied some.
    if ylim is not None:
        plt.ylim(ylim)
# Source