1.特征工程
1.1 数据预处理
1. 缺失值
使用 DataFrame.describe() 查看数据情况(pandas 没有 pd.describe() 这个顶层函数)
# Summary statistics for every column, transposed so each row describes one column.
explore = alldata.describe(include='all').transpose()
# Put each column's dtype first, then append its number of missing values.
explore.insert(0, 'dtype', alldata.dtypes)
explore['null'] = len(alldata) - explore['count']
# Save the report in its original orientation; the bare name displays it in a notebook.
explore_wide = explore.T
explore_wide.to_csv('explore2.csv')
explore_wide
查看数据是否有缺失值
# True when at least one cell of `train` is missing.
train.isnull().values.any()
查看缺失值情况
#查看缺失值情况
def missing_values(alldata, train_df=None):
    """Summarise the columns of ``alldata`` that contain missing values.

    Parameters
    ----------
    alldata : pandas.DataFrame
        Frame to inspect.
    train_df : pandas.DataFrame, optional
        Frame used for the ``train_notna`` column. When omitted, the
        module-level ``train`` frame is used, as the original code did.

    Returns
    -------
    pandas.DataFrame
        One row per column with at least one missing value, indexed by
        column name (index name ``'index'``), sorted by ``missingNum``
        descending and then by column name.  Columns: ``missingNum``,
        ``missingRatio`` (percent), ``existNum``, ``train_notna``,
        ``test_notna`` (``existNum - train_notna``) and ``dtype``.
    """
    if train_df is None:
        train_df = train  # fall back to the global training frame

    total_rows = len(alldata)
    # to_frame() replaces DataFrame(..., columns={...}); a set is not a valid
    # `columns` argument in current pandas.
    alldata_na = alldata.isnull().sum().to_frame('missingNum')
    alldata_na['missingRatio'] = alldata_na['missingNum'] / total_rows * 100
    alldata_na['existNum'] = total_rows - alldata_na['missingNum']
    alldata_na['train_notna'] = len(train_df) - train_df.isnull().sum()
    alldata_na['test_notna'] = alldata_na['existNum'] - alldata_na['train_notna']
    alldata_na['dtype'] = alldata.dtypes

    # Name the axis explicitly so reset_index() always yields an 'index'
    # column, even if alldata.columns carries its own name.
    alldata_na = (
        alldata_na[alldata_na['missingNum'] > 0]
        .rename_axis('index')
        .reset_index()
        .sort_values(by=['missingNum', 'index'], ascending=[False, True])
        .set_index('index')
    )
    return alldata_na
# Build the missing-value report for `alldata`; the bare name on the next line
# displays it when run in a notebook.
alldata_na = missing_values(alldata)
alldata_na
使用missingno库查看缺失值情况
- 缺失值分布图
- 直方图
- 相关性
详情戳:https://blog.csdn.net/weixin_43746433/article/details/97022591 第3小节
import missingno as msno
# Nullity matrix for a random sample of 500 rows of `players`.
player_sample = players.sample(500)
msno.matrix(player_sample, figsize=(16, 7), width_ratios=(15, 1))
#透视表缺失值
def time_slice(df, time_period):
    """Return the country-by-variable table for a single time period.

    Keeps the rows of ``df`` whose ``time_period`` column equals
    ``time_period``, pivots them so that ``country`` becomes the index,
    ``variable`` the columns and ``value`` the cells, and labels the column
    axis with the period.  ``df`` itself is not modified.
    """
    in_period = df.loc[df['time_period'] == time_period]
    wide = in_period.pivot(index='country', columns='variable', values='value')
    wide.columns.name = time_period
    return wide
# Slice out the '2013-2017' period and draw its nullity matrix with column labels shown.
recent = time_slice(data, '2013-2017')
msno.matrix(recent, labels=True)
求出每列的空值总和,并去除空值
import pandas as pd
loans = pd.read_csv('filtered_loans_2007.csv')
# Missing cells per column, then the same figure as a percentage of all rows.
null_counts = loans.isnull().sum()
data_na = null_counts / len(loans) * 100
# Keep only the columns that actually have missing values, worst first.
all_data_na = data_na[data_na != 0].sort_values(ascending=False)
# Drop the column with many missing values, then every row that still has a gap.
loans = loans.drop(columns="pub_rec_bankruptcies")
loans = loans.dropna(axis=0)
插值法
缺失值处理:补充缺失的数据
- 取中位数,平均值,或最近相邻的值
- 三种方法:Lagrange插值法和Newton插值法以及Series自带的interpolate
拉格朗日插值法
# 自定义列向量插值函数
# s为列向量, n为被插值的位置, k为取前后的数据个数, 默认为5
from scipy.interpolate import lagrange #导入拉格朗日插值函数
# Load the sheet without a header row and name its three columns A, B and C.
# NOTE(review): `inputfile` is not defined in this snippet; set it to the workbook path first.
data1 = pd.read_excel(inputfile, header=None,names=['A','B','C'])
def plotinterplate_columns(s, n, k=5):
    """Estimate the value at label ``n`` of ``s`` by Lagrange interpolation.

    Parameters
    ----------
    s : pandas.Series
        Column to interpolate; its index labels are used as x coordinates.
    n : int
        Label of the position to fill.
    k : int, default 5
        Number of neighbouring labels taken on each side of ``n``.

    Returns
    -------
    float
        Value at ``n`` of the Lagrange polynomial through the non-missing
        neighbours.
    """
    # The original wrote list(range(...) + list(...)), which adds a range to
    # a list and raises TypeError on Python 3.
    neighbours = list(range(n - k, n)) + list(range(n + 1, n + 1 + k))
    # reindex() turns labels outside the series (near either end) into NaN,
    # which the filter below discards, instead of raising KeyError.
    y = s.reindex(neighbours)
    y = y[y.notnull()]
    return lagrange(y.index, list(y))(n)
# 逐个判断每列是否需要插值
# Fill every missing cell, column by column, and remember what was imputed
# as (column, row, value) triples.
lagij = []
for i in data1.columns:
    # Snapshot of the gaps in this column; a filled cell is never revisited.
    is_missing = data1[i].isnull()
    for j in range(len(data1)):
        if is_missing[j]:
            value = plotinterplate_columns(data1[i], j)
            # .at writes into data1 itself; data1[i][j] = ... is chained
            # assignment and may only modify a temporary copy.
            data1.at[j, i] = value
            lagij.append((i, j, value))
print(data1)
data1.to_csv('lagrange.csv')
from pandas import Series
Series(lagij).to_csv('lagij.csv')
2. 重复值
查看数据重复
- 数据这两列有相关性,有重复值
# Distinct (variable, variable_full) pairs.
data.loc[:, ['variable', 'variable_full']].drop_duplicates()
每列值的相关性corr()绘图
# Correlation of every column with 'gdp_per_capita', minus the two GDP columns themselves.
gdp_per_capita_corr = recent.corr().loc['gdp_per_capita']
recent_corr = gdp_per_capita_corr.drop(['gdp', 'gdp_per_capita'])
def conditional_bar(series, bar_colors=None, color_labels=None, figsize=(13,24),
                    xlabel=None, by=None, ylabel=None, title=None):
    """Draw a horizontal bar chart of ``series`` and return the figure.

    Parameters
    ----------
    series : pandas.Series
        Values to plot; the index supplies the y tick labels.
    bar_colors : color or sequence of colors, optional
        Bar colour(s).  When missing or empty, the first colour of the
        current matplotlib property cycle is used.
    color_labels : dict, optional
        Maps a colour to its legend text.  A legend is drawn only when given.
    figsize : tuple, default (13, 24)
        Figure size passed to ``plt.subplots``.
    xlabel, ylabel, title : str, optional
        Axis labels and title; empty when omitted.
    by : unused
        Accepted for signature compatibility; not read.

    Returns
    -------
    matplotlib.figure.Figure
        The figure, already closed in pyplot so it is not shown twice.
    """
    fig, ax = plt.subplots(figsize=figsize)
    # len() instead of truthiness so that a NumPy array of colours is accepted.
    if bar_colors is None or len(bar_colors) == 0:
        bar_colors = mpl.rcParams['axes.prop_cycle'].by_key()['color'][0]

    positions = list(range(len(series)))
    ax.barh(positions, series.values, color=bar_colors)
    ax.set_xlabel(xlabel or '')
    ax.set_ylabel(ylabel or '')
    ax.set_yticks(positions)
    ax.set_yticklabels(series.index.tolist())
    ax.set_title(title or '')
    ax.set_ylim([-1, len(series)])

    if color_labels:
        # Empty marker-only plots exist solely to create legend entries.
        for col, lab in color_labels.items():
            ax.plot([], linestyle='', marker='s', c=col, label=lab)
        handles, labels = ax.get_legend_handles_labels()
        n_labels = len(color_labels)
        ax.legend(handles[-n_labels:], labels[-n_labels:], loc='upper right')

    plt.close(fig)
    return fig
# One colour per sign of the correlation; the chart itself shows magnitudes.
NEGATIVE_COLOR = '#0055A7'
POSITIVE_COLOR = '#2C3E4F'
bar_colors = [NEGATIVE_COLOR if negative else POSITIVE_COLOR
              for negative in recent_corr.values < 0]
color_labels = {NEGATIVE_COLOR: 'Negative correlation',
                POSITIVE_COLOR: 'Positive correlation'}
conditional_bar(recent_corr.abs(), bar_colors, color_labels,
                title='Magnitude of correlation with GDP per capita, 2013-2017',
                xlabel='|Correlation|')
根据数据特征的相关度筛选特征
- 查看数据特征的相关度大于0.5的corr()
# Rows of the correlation matrix whose correlation with 'y' exceeds 0.5.
# (The mask originally read `corrt`, a misspelling of `corr`.)
print(corr[corr['y'] > 0.5])
- 相关度影响最大的前10列
#查看影响最终价格的十个变量,并绘制热力图
k = 10
plt.figure(figsize=(12, 9))
# The k columns most strongly (positively) correlated with 'y'.
top_corr = corr.nlargest(k, 'y')['y']
print(top_corr)
cols = top_corr.index
print(cols)
sns.set(font_scale=1.25)
tick_labels = cols.values
hm = sns.heatmap(data[cols].corr(), cbar=True, annot=True, square=True, fmt='.2f',
                 annot_kws={'size': 10}, yticklabels=tick_labels, xticklabels=tick_labels)
plt.show()
去除特征相似的列
文章如下:https://blog.csdn.net/weixin_43746433/article/details/95353814
data = pd.read_csv('../data/basalt.csv')
# Pairwise correlation coefficients between the columns of `data`.
association = data.corr()
correlation_mat = association[cont_features]
sns.heatmap(correlation_mat, annot=True)
# Other ways to read the matrix: association[col] gives one column's
# correlations, and data[a].corr(data[b]) gives a single coefficient.
print(association.head())

# Collect every pair of columns whose correlation exceeds 0.9.
delSimCol = []
colNum = association.shape[0]
print(association.shape[1])
print(colNum)
names = association.columns
for i in range(colNum):
    # Only the upper triangle: each pair is examined once, never (i, i).
    for j in range(i + 1, colNum):
        if association.iloc[i, j] > 0.9:
            delSimCol.append((names[i], names[j]))
print('经过筛选得到的相似的属性为:\n', delSimCol)

# Drop the second member of each pair; dict.fromkeys removes repeated labels
# while keeping first-seen order.
delCol = list(dict.fromkeys(pair[1] for pair in delSimCol))
data.drop(delCol, axis=1, inplace=True)
数据聚合后的重复值
def get_subgroup(dataframe, g_index, g_columns):
    """Build a sub-table of ``g_columns`` keyed by ``g_index``.

    Groups ``dataframe`` by ``g_index`` and keeps the maximum of every column
    in ``g_columns``.  Before that, it prints a warning if any group holds
    more than one distinct value in any of those columns, because taking the
    maximum then discards data.

    Returns
    -------
    pandas.DataFrame
        One row per group with the per-group maximum of each column.
    """
    grouped = dataframe.groupby(g_index)
    distinct_counts = grouped.agg({col: 'nunique' for col in g_columns})
    # any().any() flags a single offending cell.  The previous
    # g[g > 1].dropna() test only fired when *every* column of a group was
    # non-unique, so most violations went unreported.
    if (distinct_counts > 1).any().any():
        print("Warning: you probably assumed this had all unique values but it doesn't.")
    return grouped.agg({col: 'max' for col in g_columns})
3. 删除无用列
保留列中需要合并的值
- isin()接受一个列表,判断该列中元素是否在列表中
- position_agg为新列
# Positions that are merged into the single "Defense" group.
defense = ['Center Back', 'Defensive Midfielder', 'Left Fullback', 'Right Fullback']
# Write into the same frame the mask is computed from; the original assigned
# to `df` while masking on `players`.
players.loc[players['position'].isin(defense), 'position_agg'] = "Defense"
各列中唯一属性的个数
# Numeric (float64/int64) column names, excluding 'loss' and 'id'.
cont_features = [
    cont for cont in data.select_dtypes(include=['float64', 'int64']).columns
    if cont not in ['loss', 'id']
]
print(len(cont_features))
print(cont_features)
# Number of distinct values in each categorical column.
# NOTE(review): `cat_features` is not defined in this snippet; it must hold
# the categorical column names before this runs.
cat_uniques = [len(data[cat].unique()) for cat in cat_features]
# DataFrame.from_items was removed from pandas; a dict gives the same two columns.
uniq_values_in_categories = pd.DataFrame({'cat_name': cat_features,
                                          'unique_values': cat_uniques})
删除只有一种值的列
orig_columns = loans_2007.columns
# A column is useless when it holds a single distinct value.  nunique()
# ignores missing values, which would otherwise count as an extra value.
drop_columns = [col for col in orig_columns if loans_2007[col].nunique() == 1]
loans_2007 = loans_2007.drop(drop_columns, axis=1)
print(drop_columns)
4. 异常值
1. 数据倾斜
- 查看数据
# Summary statistics of the three population columns, truncated to integers.
pop_columns = ['total_pop', 'urban_pop', 'rural_pop']
recent[pop_columns].describe().astype(int)
- 计算所有非object类型的skew(偏度)
from scipy.stats import norm,skew
# Columns whose dtype is anything other than object.
numeric_feats = alldata.dtypes[alldata.dtypes != 'object'].index
# Skewness of each of those columns, missing values ignored, largest first.
numeric_part = alldata[numeric_feats]
skew_feats = numeric_part.apply(lambda column: skew(column.dropna()))
skew_feats = skew_feats.sort_values(ascending=False)
skewness = skew_feats.to_frame(name='Skew')
skewness.head()
- 计算偏度
- 左偏:均值 < 中位数
- 右偏:均值 > 中位数
# Skewness of each population column.
pop_columns = ['total_pop', 'urban_pop', 'rural_pop']
recent[pop_columns].apply(scipy.stats.skew)
- 计算峰度
# Kurtosis of each population column.
pop_columns = ['total_pop', 'urban_pop', 'rural_pop']
recent[pop_columns].apply(scipy.stats.kurtosis)
- 减小偏度峰度
# Skewness and kurtosis of total population after a natural-log transform.
log_total_pop = np.log(recent[['total_pop']])
log_total_pop.apply(scipy.stats.skew)
log_total_pop.apply(scipy.stats.kurtosis)
- 绘图
def plot_hist(df, variable, bins=20, xlabel=None, by=None,
              ylabel=None, title=None, logx=False, ax=None):
    """Plot a histogram of ``df[variable]`` and return the axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    variable : str
        Column to plot.  Missing values are dropped.
    bins : int, default 20
        Number of bins (with ``logx``: number of log-spaced bin edges).
    xlabel, ylabel, title : str, optional
        Axis labels and title, set only when given.
    by : unused
        Accepted for signature compatibility; not read.
    logx : bool, default False
        Use log-spaced bins and a logarithmic x axis.  Data containing values
        <= 0 is shifted so that its minimum becomes 1, and a warning with the
        applied shift is printed.
    ax : matplotlib Axes, optional
        Axes to draw on; a new 12x8 figure is created when omitted.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(12, 8))

    # Work on a private copy of the column.  The original overwrote
    # df[variable] in the caller's frame when shifting for the log scale.
    values = df[variable].dropna()
    if logx:
        lowest = values.min()
        if lowest <= 0:
            # Compute the shift before applying it; the original computed it
            # afterwards and therefore always reported 0.
            shift = 1 - lowest
            values = values + shift
            print('Warning: data <=0 exists, data transformed by %0.2g before plotting' % shift)
        bins = np.logspace(np.log10(values.min()), np.log10(values.max()), bins)
        ax.set_xscale("log")

    ax.hist(values.values, bins=bins)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax
# Histogram of total population across countries for the 2013-2017 slice.
plot_hist(recent, 'total_pop', bins=25, logx=True, # logx=True: log-spaced bins on a logarithmic x axis
xlabel='Log of total population', ylabel='Number of countries',
title='Distribution of total population of countries 2013-2017');
2. 清除异常值四分位法
def detect_outliers2(df):
    """Remove outliers from a 1-D collection using the 1.5 * IQR rule.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where the quartiles are computed once from the full
    input.

    Parameters
    ----------
    df : list, numpy.ndarray or pandas.Series
        Numeric values.

    Returns
    -------
    Same type as the input.  A list is filtered in place and the same list
    object is returned; arrays and Series yield a filtered selection.
    """
    values = np.asarray(df, dtype=float)
    if values.size == 0:
        return df

    q1, q3 = np.percentile(values, [25, 75])
    outlier_step = 1.5 * (q3 - q1)
    is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)

    if isinstance(df, list):
        # Rebuild the list in one step.  The original called df.remove()
        # while iterating over df, which skips the element following each
        # removal, so consecutive outliers survived.
        df[:] = [item for item, bad in zip(df, is_outlier) if not bad]
        return df
    return df[~is_outlier]
1.2 数据标准化
1.2.1 处理数值型属性
- 简单函数 规范化 按照比例缩放
# 简单函数 规范化 按照比例缩放
# Divide every numeric column by its own 75th percentile; columns whose
# 75th percentile is 0 are left untouched.
numeric_feats = alldata.dtypes[alldata.dtypes != "object"].index
t = alldata[numeric_feats].quantile(.75)
use_75_scater = t[t != 0].index
q75_divisor = t[use_75_scater]
alldata[use_75_scater] = alldata[use_75_scater] / q75_divisor
1.2.2 标准化数据使符合正态分布
from scipy.special import boxcox1p
# Columns to transform with Box-Cox (np.log1p on the same columns is the alternative).
t = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']
# The label gets log1p; predictions are mapped back with np.expm1.
train["SalePrice"] = np.log1p(train["SalePrice"])
# Box-Cox power parameter passed to boxcox1p (it is not a confidence level).
lam = 0.15
for feat in t:
    alldata[feat] = boxcox1p(alldata[feat], lam)
1.3 特征提取
1. 转换列表中的值
保留列中需要的两值变为0和1 (target)
print(loans_2007['loan_status'].value_counts())
# Keep only the two outcomes used as the target, then encode them as 1 / 0.
kept_statuses = ["Fully Paid", "Charged Off"]
loans_2007 = loans_2007[loans_2007['loan_status'].isin(kept_statuses)]
status_replace = {"loan_status": {"Fully Paid": 1, "Charged Off": 0}}
loans_2007 = loans_2007.replace(status_replace)
关键字信息提取
def position_detail():
    """Print how many job descriptions mention Python.

    Reads ``./lagou_data5.csv`` (GBK encoded), normalises the spelling
    ``python`` to ``Python`` in the ``position_detail`` column, flags each
    row with 1 if the text contains ``Python`` and 0 otherwise, and prints
    the counts of both flags.  Rows with missing text are flagged 0.
    """
    lagou_df2 = pd.read_csv('./lagou_data5.csv', encoding='gbk')
    # copy() so the assignments below act on an independent frame.
    lagou_df2 = lagou_df2[['position_detail', 'salary']].copy()

    # Vectorised string methods replace the row-by-row loops, whose chained
    # assignment (frame[col][i] = ...) may write to a temporary copy.
    detail = lagou_df2['position_detail'].str.replace('python', 'Python', regex=False)
    lagou_df2['position_detail'] = detail
    mentions_python = detail.str.contains('Python', regex=False, na=False)
    lagou_df2['Python'] = mentions_python.astype(int)

    print('Python\n', lagou_df2['Python'].value_counts())
2. 转换列的字符类型
# How many columns there are of each dtype.
print(loans.dtypes.value_counts())
object_columns_df = loans.select_dtypes(include=["object"])
# Look at the value counts of each candidate column to decide whether to
# convert it or drop it.
cols = ['home_ownership', 'verification_status', 'emp_length', 'term', 'addr_state']
for c in cols:
    print(loans[c].value_counts())
# Employment length as a number of years; "< 1 year" and "n/a" both map to 0.
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0,
    }
}
# Drop the columns that have too many distinct values.
loans = loans.drop(["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"], axis=1)
# Strip the trailing "%" so the interest rate becomes a float.
loans["int_rate"] = loans["int_rate"].str.rstrip("%").astype("float")
loans = loans.replace(mapping_dict)
或者:
# Detailed region path -> short region name.
simple_regions = {
    'World | Asia': 'Asia',
    'Americas | Central America and Caribbean | Central America': 'North America',
    'Americas | Northern America | Northern America': 'North America',
    'Americas | Northern America | Mexico': 'North America',
    'Americas | Southern America | Guyana': 'South America',
    'Americas | Southern America | Southern America': 'South America',
    'World | Africa': 'Africa',
    'World | Europe': 'Europe',
    'World | Oceania': 'Oceania',
}
# Replace every region by its short name; an unmapped region raises KeyError.
data['region'] = data['region'].apply(lambda region: simple_regions[region])
3. 使用sklearn标签映射
from sklearn.preprocessing import LabelEncoder
cols = ('a', 'b', 'c')
# Replace each listed column by integer codes from its own, freshly fitted encoder.
for c in cols:
    lbl = LabelEncoder()
    df[c] = lbl.fit_transform(list(df[c].values))
4. 字符one-hot编码
# Swap the categorical columns for their one-hot (dummy) columns.
cat_columns = ["home_ownership", "verification_status", "emp_length", "purpose", "term"]
dummy_df = pd.get_dummies(loans[cat_columns])
remaining = loans.drop(cat_columns, axis=1)
loans = pd.concat([remaining, dummy_df], axis=1)
2.准确率
2.1 通过真实值与预测值的对比
def run_cv(X,y,clf_class,**kwargs):
    """Return out-of-fold predictions from shuffled 5-fold cross-validation.

    Parameters
    ----------
    X, y : array-like
        Features and labels.  Both are indexed with integer position arrays,
        so NumPy arrays are expected.
    clf_class : type
        Estimator class; a fresh instance is created for every fold.
    **kwargs
        Keyword arguments passed to ``clf_class``.

    Returns
    -------
    A copy of ``y`` in which every entry has been replaced by the prediction
    of the model trained on the folds that exclude it.
    """
    # Imported here because this snippet appears before the file's own
    # `from sklearn.model_selection import KFold` line.
    from sklearn.model_selection import KFold

    # model_selection.KFold takes n_splits and yields indices via split();
    # KFold(n_folds=...) iterated directly is the removed cross_validation API.
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()
    for train_index, test_index in kf.split(X):
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        y_pred[test_index] = clf.predict(X[test_index])
    return y_pred
# Share of samples whose out-of-fold prediction equals the true label.
accuracy = np.mean(y == run_cv(X, y, SVC))
print("Support vector machines:")
print("%.3f" % accuracy)
2.2 预测总正确率(正确个数/总个数)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
loans = pd.read_csv("cleaned_loans2007.csv")
# class_weight="balanced" re-weights the classes to offset class imbalance.
lr = LogisticRegression(class_weight="balanced")
kf = KFold(5, shuffle=True, random_state=1)
# NOTE(review): `features` and `target` are not defined in this snippet.
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
loans['predicted_label'] = predictions
# Rows where the prediction agrees with the actual loan status.
matches = loans["predicted_label"] == loans["loan_status"]
correct_predictions = loans[matches]
print('len(correct_predictions)', len(correct_predictions))
print('float(len(loans))', float(len(loans)))
accuracy = len(correct_predictions) / float(len(loans))
print('准确率', accuracy)
2.3 预测类别正确率
# Sensitivity: true positives divided by all actual positives.
predicted_positive = admissions["predicted_label"] == 1
actually_positive = admissions["actual_label"] == 1
true_positive_filter = predicted_positive & actually_positive
true_positives = len(admissions[true_positive_filter])
false_negative_filter = (admissions["predicted_label"] == 0) & actually_positive
false_negatives = len(admissions[false_negative_filter])
sensitivity = true_positives / float(true_positives + false_negatives)
print(sensitivity)
2.3 交叉验证
# Imports come first: the original used KFold before importing it.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kf = KFold(5, shuffle=True, random_state=8)
lr = LogisticRegression()
# One ROC-AUC score per fold, then their mean.
accuracies = cross_val_score(lr, admissions[["gpa"]], admissions["actual_label"], scoring="roc_auc", cv=kf)
average_accuracy = accuracies.mean()
print(accuracies)
print(average_accuracy)
2.4 ROC曲线(二分类)
- ROC曲线下面积(AUC)值越大,性能越好
from sklearn import metrics
# Column 1 of predict_proba is used as the score for the ROC curve.
probabilities = model.predict_proba(test[["gpa"]])
positive_scores = probabilities[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(test["actual_label"], positive_scores)
print(thresholds)
plt.plot(fpr, tpr)
plt.show()
3. 寻找最优模型
3.1 K折验证得到最优模型
from sklearn.model_selection import KFold
def run_cv(X,y,clf_class,**kwargs):
    """Return out-of-fold predictions from shuffled 5-fold cross-validation.

    Parameters
    ----------
    X, y : array-like
        Features and labels.  Both are indexed with integer position arrays,
        so NumPy arrays are expected.
    clf_class : type
        Estimator class; a fresh instance is created for every fold.
    **kwargs
        Keyword arguments passed to ``clf_class``.

    Returns
    -------
    A copy of ``y`` in which every entry has been replaced by the prediction
    of the model trained on the folds that exclude it.
    """
    # model_selection.KFold takes n_splits and yields indices via split();
    # KFold(n_folds=...) iterated directly is the removed cross_validation API.
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()
    for train_index, test_index in kf.split(X):
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        y_pred[test_index] = clf.predict(X[test_index])
    return y_pred
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
#得到预测的结果
def accuracy(y_true,y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both inputs are converted to NumPy arrays so the comparison is
    element-wise.  Two plain lists would otherwise compare as a single bool
    and give 0.0 or 1.0.
    """
    # The boolean matches average as 1.0 / 0.0.
    return np.mean(np.asarray(y_true) == np.asarray(y_pred))
# Cross-validated accuracy of each candidate model, printed under its heading.
for heading, clf_class in (
    ("Support vector machines:", SVC),
    ("Random forest:", RF),
    ("K-nearest-neighbors:", KNN),
):
    print(heading)
    print("%.3f" % accuracy(y, run_cv(X, y, clf_class)))
4.寻找模型最优参数
4.1 使用GridSearchCV()函数设置XGBoostRegressor参数
它存在的意义就是自动调参,只要把参数输进去,就能给出最优化的结果和参数。但是这个方法适合于小数据集,一旦数据的量级上去了,很难得出结果。
重要属性及方法:
-
cv_results_:旧版本是“grid_scores_”,cv_results_是详尽的升级版。内容较好理解,包含了'mean_test_score'(验证集平均得分)、'rank_test_score'(验证集得分排名)、'params'(dict形式存储所有待选params的组合),甚至还有每次划分的交叉验证得分('split0_test_score'、'split1_test_score'等),只是输出的内容稍显臃肿。内容以dict形式输出,可以转成DataFrame形式,看起来更清晰一些。
-
best_params_ : dict:最佳参数组合
-
best_score_ : float:cv_results_属性中,'mean_test_score'里面的最高分,即验证集上得到的最高分数
-
best_estimator_ : estimator or dict:得到打分最高的超参组合对应的estimator
-
fit()/predict():用网格搜索得到的最佳超参所构建的estimator对数据集进行fit、predict
get_params():这个和‘best_estimator_ ’这个属性相似,但可以得到这个模型更多的参数
# Grid: 5 depths x 3 child weights = 15 parameter combinations.
xgb_param_grid = {'max_depth': list(range(4, 9)), 'min_child_weight': [1, 3, 6]}
xgb_param_grid['max_depth']  # [4, 5, 6, 7, 8]
grid = GridSearchCV(XGBoostRegressor(eta=0.1, num_boost_round=50, colsample_bytree=0.5, subsample=0.5),
                    param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(train_x, train_y.values)
# cv_results_ replaces the removed grid_scores_ attribute.
grid.cv_results_, grid.best_params_, grid.best_score_
4.2 寻找决策树最优参数
树模型参数:
-
1.criterion gini or entropy
-
2.splitter best or random 前者是在所有特征中找最好的切分点 后者是在部分特征中(数据量大的时候)
-
3.max_features None(所有),log2,sqrt,N 特征小于50的时候一般使用所有的
-
4.max_depth 数据少或者特征少的时候可以不管这个值,如果模型样本量多,特征也多的情况下,可以尝试限制下
-
5.min_samples_split 如果某节点的样本数少于min_samples_split,则不会继续再尝试选择最优特征来进行划分如果样本量不大,不需要管这个值。如果样本量数量级非常大,则推荐增大这个值。
-
6.min_samples_leaf 这个值限制了叶子节点最少的样本数,如果某叶子节点数目小于样本数,则会和兄弟节点一起被剪枝,如果样本量不大,不需要管这个值,大些如10W可以尝试下5
-
7.min_weight_fraction_leaf 这个值限制了叶子节点所有样本权重和的最小值,如果小于这个值,则会和兄弟节点一起被剪枝默认是0,就是不考虑权重问题。一般来说,如果我们有较多样本有缺失值,或者分类树样本的分布类别偏差很大,就会引入样本权重,这时我们就要注意这个值了。
-
8.max_leaf_nodes 通过限制最大叶子节点数,可以防止过拟合,默认是"None",即不限制最大的叶子节点数。如果加了限制,算法会建立在最大叶子节点数内最优的决策树。如果特征不多,可以不考虑这个值,但是如果特征数量多的话,可以加以限制,具体的值可以通过交叉验证得到。
-
9.class_weight 指定样本各类别的的权重,主要是为了防止训练集某些类别的样本过多导致训练的决策树过于偏向这些类别。这里可以自己指定各个样本的权重如果使用“balanced”,则算法会自己计算权重,样本量少的类别所对应的样本权重会高。
-
10.min_impurity_split 这个值限制了决策树的增长,如果某节点的不纯度(基尼系数,信息增益,均方差,绝对差)小于这个阈值则该节点不再生成子节点。即为叶子节点 。
-
n_estimators:要建立树的个数
# sklearn.grid_search was removed; GridSearchCV lives in model_selection,
# which is also where the rest of this file imports it from.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
tree_param_grid = {'min_samples_split': [3, 6, 9], 'n_estimators': [10, 50, 100]}
grid = GridSearchCV(RandomForestRegressor(), param_grid=tree_param_grid, cv=5)
grid.fit(data_train, target_train)
# cv_results_ replaces the removed grid_scores_ attribute.
grid.cv_results_, grid.best_params_, grid.best_score_
4.3 支持向量机 SVC(RBF 核)
import time

from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# Whitened 150-component PCA feeding an RBF-kernel SVC with balanced class weights.
pca = PCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)

Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target,
                                                random_state=40)

# Parameter names are prefixed with the pipeline step they belong to ("svc__").
param_grid = {'svc__C': [1, 5, 10],
              'svc__gamma': [0.0001, 0.0005, 0.001]}
grid = GridSearchCV(model, param_grid)

# Plain timing in place of the IPython-only `%time` magic, so the snippet
# also runs as an ordinary script.
started = time.perf_counter()
grid.fit(Xtrain, ytrain)
print('Wall time: %.1f s' % (time.perf_counter() - started))
print(grid.best_params_)