名词解释
1.监督学习
监督学习(supervised learning)是指从标注数据中学习预测模型的机器学习问题。标注数据表示输入输出的对应关系,预测模型对给定的输入产生相应的输出。监督学习的本质是学习输入到输出的映射的统计规律。
- 在监督学习中,将输入与输出看作是定义在输入(特征)空间与输出空间上的随机变量的取值。
- 人们根据输入输出变量的不同类型,对预测任务基于不同的名称:输入变量与输出变冷了均为连续变量的预测问题称为回归问题;输出变量为有限个离散变量的预测问题称为分类问题;输入变量与输出变量均为变量序列的预测问题称为标注问题。
学习系统通过不断地尝试,选取最好的模型,以便对训练数据集与足够好的预测,同时对未知的测试数据集的预测也有尽可能好的推广
2 无监督学习
无监督学习(unsupervised learning)是指从无标注数据中学习预测模型的机器学习问题。无标注数据是自然得到的数据,预测模型表示数据的类别、转换或概率。无监督学习的本质是学习数据中的统计规律或潜在结构。
- 每个输入是一个实例,由特征向量表示。每一个输出是对输入的分析结果,由输入的类别、转换或概率表示。模型可以实现对数据的聚类、降维或概率估计。
强化学习
强化学习(reinforcement learning)是指智能系统在与环境的连续互动中学习最优行为策略机器学习问题。假设智能系统与环境的互动基于马尔可夫决策过程(Markov deccision process),智能系统能观测到的是与环境互动得到的数据序列。强化学习的本质是学习最优的序贯决策。
具体分类
常见监督学习模型
- 线性模型
- 决策树
- 神经网络
- 支持向量机
- 贝叶斯分类
- 集成学习模型(RandomForest, GBDT, XGBoost, LightGBM)
代码拆解
读取数据
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore') # 忽略警告
pd.set_option('display.max_columns', None) # 显示所有列
import scipy.stats as st
import seaborn as sns
import missingno as msno # 缺失值可视化的库
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
reduce_mem_usage 函数调整数据类型,减少数据在内存中占用的空间
// An highlighted block
def reduce_mem_usage(df):
""" iterate through all the columns of a dataframe and modify the data type
to reduce memory usage.
"""
start_mem = df.memory_usage().sum()
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
for col in df.columns:
col_type = df[col].dtype
if col_type != object:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
else:
df[col] = df[col].astype('category')
end_mem = df.memory_usage().sum()
print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
return df
sample_feature = reduce_mem_usage(pd.read_csv('used_car_train_20200313.csv'))
简单建模
sample_feature = sample_feature.dropna().replace('-', 0).reset_index(drop=True)
sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)
train = sample_feature[continuous_feature_names + ['price']]
train_X = train[continuous_feature_names]
train_y = train['price']
from sklearn.linear_model import LinearRegression
model = LinearRegression(normalize=True)
model = model.fit(train_X, train_y)
查看训练的线性回归模型的截距(intercept)与权重(coef)
'intercept:'+ str(model.intercept_)
sorted(dict(zip(continuous_feature_names, model.coef_)).items(), key=lambda x:x[1], reverse=True)
from matplotlib import pyplot as plt
subsample_index = np.random.randint(low=0, high=len(train_y), size=50)
绘制散点图
预测值为蓝,真实值为红,预测值小于0说明模型存在问题
plt.scatter(train_X['v_9'][subsample_index], train_y[subsample_index], color='red')
plt.scatter(train_X['v_9'][subsample_index], model.predict(train_X.loc[subsample_index]), color='blue')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price','Predicted Price'],loc='upper right')
print('The predicted price is obvious different from true price')
plt.show()
作图时发现呈现长尾分布,应当修改为正态分布
此时需要对坐标进行log(x+1)变换
import seaborn as sns
print('It is clear to see the price shows a typical exponential distribution')
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.distplot(train_y)
plt.subplot(1,2,2)
sns.distplot(train_y[train_y < np.quantile(train_y, 0.9)])
train_y_ln = np.log(train_y + 1)
import seaborn as sns
print('The transformed price seems like normal distribution')
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.distplot(train_y_ln)
plt.subplot(1,2,2)
sns.distplot(train_y_ln[train_y_ln < np.quantile(train_y_ln, 0.9)])
model = model.fit(train_X, train_y_ln)
print('intercept:'+ str(model.intercept_))
sorted(dict(zip(continuous_feature_names, model.coef_)).items(), key=lambda x:x[1], reverse=True)
然后再次可视化,接近真实值,未出现异常
plt.scatter(train_X['v_9'][subsample_index], train_y[subsample_index], color='red')
plt.scatter(train_X['v_9'][subsample_index], np.exp(model.predict(train_X.loc[subsample_index])), color='blue')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price','Predicted Price'],loc='upper right')
print('The predicted price seems normal after np.log transforming')
plt.show()