总结美赛Y题
过去的几天,参加了美赛春季赛,选择的题目是Y题,现在总结一下用到的模型和代码。
常用的操作
- 数据的各类索引,赋值
- 多维的直方图
- 多维的热力图
- label的分布及对数化后的分布
- 数据集的随机分割,k折交叉验证
数据预处理中分析部分
读取
读取模板
import pandas as pd

# Template: load one sheet of an Excel workbook into the DataFrame `df1`.
filename = 'filename.xlsx'
sheetname = 0  # passed to read_excel as sheet_name
# Column(s) to use as the row index. None (the read_excel default) keeps the
# automatic integer index; replace it with e.g. ['', '', ''] when needed.
# It has to be a real assignment: read_excel below references this name.
col_index = None
df1 = pd.read_excel(filename, sheet_name=sheetname, index_col=col_index)
连续性变量分析
- 提取连续变量, 显示个数模板
# Template: select the continuous columns and inspect how complete they are.
con_features = ['', '', '']  # fill in the continuous column names
con_view = df1[con_features]
con_view.count()  # non-null count per column
con_view.info()   # prints the summary of the selected columns
- 统计探索性分析模板
# Descriptive statistics for the continuous columns listed in con_features.
df1.loc[:, con_features].describe()
- 查找某一连续变量中数值最大的前k个样本(可用于异常值检测)
# Outlier check: show the k rows with the largest values in one column.
y = df1['your_column'].values
k = 6  # how many of the largest values to keep
# Sort once (ascending) and keep the last k positions. Starting the slice at
# max(len(y) - k, 0) rather than -k means k = 0 selects nothing (instead of
# every row) and k > len(y) selects all rows.
# NOTE: NumPy sorts NaN to the end, so missing values would show up among the
# "largest" rows; drop them first if the column can contain NaN.
y_index = y.argsort()[max(len(y) - k, 0):]
df1.iloc[y_index, :]
分类变量分析(频次是非常重要的特征)
- 查看某一分类数据的频次统计
# Template: frequency table for one categorical column.
cat_features = ['', '', '', '']  # fill in the categorical column names
target_column = df1['your_colunms']
target_column.value_counts()  # returns a Series
- 收集频次范围内的特征 待完善
# Keep the 'Variant' categories that occur at least 20 times and export
# their counts to an Excel file.
variant_counts = df1['Variant'].value_counts()
frequent_variants = variant_counts[variant_counts.values >= 20]
frequent_variants.to_excel("感兴趣的单体船.xlsx")
- 收集某一样本频次范围内的样本 待完善
- 分类变量的标签编码
from sklearn.preprocessing import LabelEncoder

# Template: label-encode one categorical column.
data = df1  # the frame loaded by the read template
columnname = ''  # name of the categorical column to encode
# Fit and transform must use the same column variable.
encoder = LabelEncoder().fit(data[columnname])
# To store the codes back into the frame, assign the result instead:
# data[columnname] = encoder.transform(data[columnname])
encoder.transform(data[columnname])
# data.to_excel('问题二.xlsx', index=False)
数据预处理中可视化部分
连续性变量可视化
- label的分布及对数化后label的分布图
import matplotlib.pyplot as plt
import numpy as np

# Template: histogram of the label next to the histogram of its logarithm.
train = df1
your_label = ''  # name of the label column
titlename = ''
titlename_log = ' (log)'
# SimHei is a CJK font; presumably set so Chinese titles render correctly.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.figure(figsize=(16, 5))

# Left: raw label. Index with the variable, not the literal string
# 'your_label', so both panels plot the same column.
plt.subplot(121)
plt.hist(train[your_label], bins=50)
plt.title(titlename)

# Right: log-transformed label. np.log needs strictly positive values.
plt.subplot(122)
plt.hist(np.log(train[your_label]), bins=50, color='g')
plt.title(titlename_log)

# plt.savefig('distribution of price.jpg', dpi=200, bbox_inches='tight')
plt.show()
- 多维连续变量直方图
import matplotlib.pyplot as plt

# Template: one histogram per continuous column, saved to a single image.
con_features = ['Length \n(ft)', 'beam', 'draft mean', 'fuel capacity', 'water capacity',
                'number of habour', 'Cost of living index', 'Rent index',
                'cost_of_living plus rent index', 'geoceries index',
                'Res price index', 'local purchase power index']
train = df1
picname = ''  # output image path; must be filled in before saving
# DataFrame.hist opens its own figure when no axes are passed, so no
# plt.figure() call is made first (it would only leave an empty figure).
train[con_features].hist(bins=60, figsize=(16, 12))
# The histogram grid is the current figure at this point.
plt.savefig(picname)
- 连续变量相关性热力图
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
train = df1
con_features = ['beam', 'draft mean', 'number of habour', 'Cost of living index',
'Listing Price (USD)']
plt.figure(figsize=(12,8))
corr=train[con_features].corr()
p1 = sns.heatmap(corr,annot=True,cmap='GnBu')
s1 = p1.get_figure()
# s1.savefig('heat map.jpg',dpi=300,bbox_inches='tight')
分类变量的可视化
- 分类变量有啥可视化呢?
多图可视化
import matplotlib.pyplot as plt
%matplotlib inline
# 查看label的直方图
train = df1
# plt.figure(figsize=(12,8))
plt.subplot(221)
# plt.plot(train.index,train['Listing Price (USD)'])
plt.hist(train['Listing Price (USD)'])
plt.xlabel('index')
plt.ylabel('price')
#plt.savefig('fre figure of label.jpg', dpi=200, bbox_inches='tight')
# plt.legend()
# 查看label的直方图
# plt.figure(figsize=(12,8))
plt.subplot(222)
# plt.plot(train.index,train['Year'])
plt.hist(train['Year'])
plt.xlabel('index')
#plt.ylabel('price')
#plt.savefig('fre figure of label and year.jpg', dpi=200, bbox_inches='tight')
plt.subplot(223)
plt.plot(train.index,train['Listing Price (USD)'])
#plt.hist(train['Listing Price (USD)'])
plt.xlabel('index')
plt.ylabel('price')
#plt.savefig('fre figure of label.jpg', dpi=200, bbox_inches='tight')
# plt.legend()
# 查看label的直方图
# plt.figure(figsize=(12,8))
plt.subplot(224)
plt.plot(train.index,train['Year'])
# plt.hist(train['Year'])
plt.xlabel('index')
#plt.ylabel('price')
plt.savefig('fre figure of label and year.jpg', dpi=200, bbox_inches='tight')
plt.show()
数据预处理中对分类变量标签与one-hot编码
- 对连续变量先分箱(pd.cut)再进行one-hot编码(对机器算法而言更精准)待完善
def demo(df1, column='Listing Price (USD)', bins=3):
    """Bin a continuous column into equal-width intervals and one-hot encode it.

    Args:
        df1: DataFrame that contains ``column``.
        column: Name of the continuous column to discretise. Defaults to
            'Listing Price (USD)'.
        bins: Number of equal-width bins passed to ``pd.cut``. Defaults to 3.

    Returns:
        DataFrame with one indicator column per bin; the column names are
        prefixed with ``column``.
    """
    # Bin once and feed the result straight into get_dummies.
    binned = pd.cut(df1[column], bins)
    return pd.get_dummies(binned, prefix=column)
# Run the binning + one-hot template on df1 (presumably shown as the cell
# output when executed in a notebook).
demo(df1)
# Optional follow-up, left disabled: write the dummy columns back into df1
# and export the frame to Excel.
# df1.loc[:, ['price_degree1', 'price_degree2', 'price_degree3']] = demo(df1).values
# df1.to_excel('双模型数据.xlsx', index=False)
- 对(字符型)分类变量进行直接标签编码(范围0–n-1)
通常作为如spss软件进行分类变量分析时的前置步骤,不能直接用于机器学习!
模型输入的x,y划分及训练集验证集划分(学术上表示模型性能)
- 随机将数据集分成指定比例的训练集和测试集
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data. (When reading from a path, use \\ or /, or prefix the path
# string with r.)
dataset = df1

# Build the model inputs by column position; label-based indexing could be
# used here instead.
# NOTE(review): the original notes describe the variables as gasoline tax,
# per-capita income, highways and population share (features) and gasoline
# consumption (target) - this looks carried over from another dataset; confirm.
X = dataset.iloc[:, :16].values
y = dataset.iloc[:, 18].values

# Hold out 20% of the rows as the test set; random_state fixes the shuffle.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

# Feature scaling. The original notes consider it usually unnecessary, on the
# grounds that random forests are insensitive to feature scale.
# The scaler is fitted on the training set only and then applied to the test set.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
- k折交叉验证法
import xgboost as xgb
import pickle
from sklearn.metrics import mean_absolute_error,make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix,hstack
from sklearn.model_selection import KFold,train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split,StratifiedKFold,train_test_split,GridSearchCV
# Split the frame into the feature matrix (train_x) and the label (train_y).
train = df1
ntrain = train.shape[0]  # number of rows
others = ['', '', '']  # columns to leave out of the feature set
# Membership must be tested against `others` itself; wrapping it as `[others]`
# would compare each column name with the whole list and exclude nothing.
# NOTE(review): presumably `others` should include the label column
# 'price(log)' so it does not leak into the features - confirm.
features = [x for x in train.columns if x not in others]
train_x = train[features]
train_y = train['price(log)']
print('Xtrain:', train_x.shape)
print('ytrain:', train_y.shape)