import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
plt.rcParams.update({
'figure.max_open_warning': 0})
import seaborn as sns
# modelling
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score,cross_val_predict,KFold
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,StandardScaler
# ====================================================================
#load_dataset
with open("datalab/7811/zhengqi_train.txt") as fr:
data_train=pd.read_table(fr,sep="\t")
with open("datalab/7811/zhengqi_test.txt") as fr_test:
data_test=pd.read_table(fr_test,sep="\t")
#merge train_set and test_set
data_train["oringin"]="train"
data_test["oringin"]="test"
data_all=pd.concat([data_train,data_test],axis=0,ignore_index=True)
#View data
data_all.head()
# ====================================================================
# Explore feature distibution
#fig = plt.figure(figsize=(6, 6))
for column in data_all.columns[0:-2]:
g = sns.kdeplot(data_all[column][(data_all["oringin"] == "train")], color="Red", shade = True)
g = sns.kdeplot(data_all[column][(data_all["oringin"] == "test")], ax =g, color="Blue", shade= True)
g.set_xlabel(column)
g.set_ylabel("Frequency")
g = g.legend(["train","test"])
plt.show()
fig = plt.figure(figsize=(10, 10))
for i in range(len(data_all.columns)-2):
g = sns.FacetGrid(data_all, col='oringin')
g = g.map(sns.distplot, data_all.columns[i])
# ====================================================================
#删除特征"V5","V9","V11","V17","V22","V28",训练集和测试集分布不均
for column in ["V5","V9","V11","V17","V22","V28"]:
g = sns.kdeplot(data_all[column][(data_all["oringin"] == "train")], color="Red", shade = True)
g = sns.kdeplot(data_all[column][(data_all["oringin"] == "test")], ax =g, color="Blue", shade= True)
g.set_xlabel(column)
g.set_ylabel("Frequency")
g = g.legend(["train","test"])
plt.show()
data_all.drop(["V5","V9","V11","V17","V22","V28"],axis=1,inplace=True)
# ====================================================================
# figure parameters
data_train1=data_all[data_all["oringin"]=="train"].drop("oringin",axis=1)
fcols = 2
frows = len(data_train.columns)
plt.figure(figsize=(5*fcols,4*frows))
i=0
for col in data_train1.columns:
i+=1
ax=plt.subplot(frows,fcols,i)
sns.regplot(x=col, y='target', data=data_train, ax=ax,
scatter_kws={
'marker':'.','s':3,'alpha':0.3},
line_kws={
'color':'k'});
plt.xlabel(col)
plt.ylabel('target')
i+=1
ax=plt.subplot(frows,fcols,i)
sns.distplot(data_train[col].dropna() , fit=stats.norm)
plt.xlabel(col)
# ====================================================================
# 找出相关程度
plt.figure(figsize=(20, 16)) # 指定绘图对象宽度和高度
colnm = data_train1.columns.tolist() # 列表头
mcorr = data_train1[colnm].corr(method="spearman") # 相关系数矩阵,即给出了任意两个变量之间的相关系数
mask = np.zeros_like(mcorr, dtype=np.bool) # 构造与mcorr同维数矩阵 为bool型
mask[np.triu_indices_from(mask)] = True # 角分线右侧为True
cmap = sns.diverging_palette(220, 10, as_cmap=True) # 返回matplotlib colormap对象
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f') # 热力图(看两两相似度)
plt.show()
# ====================================================================
# normalise numeric columns
cols_numeric=list(data_all.columns)
cols_numeric.remove("oringin")
def scale_minmax(col):
return (col-col.min())/(col.max()-col.min())
scale_cols = [col for col in cols_numeric if col!='target']
data_all[scale_cols] = data_all[scale_cols].apply(scale_minmax,axis=0)
data_all[scale_cols].describe()
# ====================================================================
#Check effect of Box-Cox transforms on distributions of continuous variables
fcols = 6
frows = len(cols_numeric)-1
plt.figure(figsize=(4*fcols,4*frows))
i=0
for var in cols_numeric:
if var!='target':
dat = data_all[[var, 'target']].dropna()
i+=1
plt.subplot(frows,fcols,i)
sns.distplot(dat[var] , fit=stats.norm);
plt.title(var+' Original')
plt.xlabel(
【天池新人赛-工业蒸汽量预测】5.另一份值得学习的代码参考
最新推荐文章于 2023-09-23 20:38:25 发布