你怎么从全世界找到喜欢你的人?
select * from world where someone like '%you%';
no results!
项目工程和数据集我已上传到《集成学习:随机森林、GBDT、XGBoost实战代码合集》。
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import sklearn
from sklearn.linear_model import LinearRegression, LassoCV, Ridge, ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# FIX: ConvergenceWarning must be imported from the public sklearn.exceptions
# module; the private path sklearn.linear_model.coordinate_descent was removed
# in scikit-learn 0.24, so the old import raises ImportError on modern versions.
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import BaggingRegressor,AdaBoostRegressor,GradientBoostingRegressor
## Configure the matplotlib font so Chinese axis labels render correctly
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False
## Suppress the expected convergence warnings from the iterative solvers
warnings.filterwarnings(action = 'ignore', category=ConvergenceWarning)
def notEmpty(s):
    """Return True when the split token *s* is a non-blank string.

    Splitting a data line on single spaces produces '' for every run of
    consecutive spaces; this predicate drops those empty tokens.
    """
    return bool(s)
## Load the Boston housing dataset
names = ['CRIM','ZN', 'INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT']
path = "datas/boston_housing.data"
## The raw file is whitespace-aligned with a variable number of spaces between
## columns, so read each line as a single string and tokenise it by hand.
fd = pd.read_csv(path,header=None)
data = np.empty((len(fd), 14))  # 13 feature columns + 1 target per row
for row, line in enumerate(fd.values):
    tokens = [tok for tok in line[0].split(' ') if tok != '']
    data[row] = [float(tok) for tok in tokens]
## Split the matrix into the 13 feature columns (x) and the target column (y)
x, y = np.split(data, (13,), axis=1)
print (x[0:5])
y = y.ravel()  # flatten (n, 1) -> (n,)
print (y[0:5])
ly=len(y)
print(y.shape)
print ("样本数据量:%d, 特征个数:%d" % x.shape)
print ("target样本数据量:%d" % y.shape[0])
[[6.3200e-03 1.8000e+01 2.3100e+00 0.0000e+00 5.3800e-01 6.5750e+00
6.5200e+01 4.0900e+00 1.0000e+00 2.9600e+02 1.5300e+01 3.9690e+02
4.9800e+00]
[2.7310e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 6.4210e+00
7.8900e+01 4.9671e+00 2.0000e+00 2.4200e+02 1.7800e+01 3.9690e+02
9.1400e+00]
[2.7290e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 7.1850e+00
6.1100e+01 4.9671e+00 2.0000e+00 2.4200e+02 1.7800e+01 3.9283e+02
4.0300e+00]
[3.2370e-02 0.0000e+00 2.1800e+00 0.0000e+00 4.5800e-01 6.9980e+00
4.5800e+01 6.0622e+00 3.0000e+00 2.2200e+02 1.8700e+01 3.9463e+02
2.9400e+00]
[6.9050e-02 0.0000e+00 2.1800e+00 0.0000e+00 4.5800e-01 7.1470e+00
5.4200e+01 6.0622e+00 3.0000e+00 2.2200e+02 1.8700e+01 3.9690e+02
5.3300e+00]]
[24. 21.6 34.7 33.4 36.2]
(506,)
样本数据量:506, 特征个数:13
target样本数据量:506
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=28)
## Baseline: a single Ridge regression (linear model with an L2 penalty)
lr = Ridge(alpha=0.1)
lr.fit(x_train, y_train)
ridge_train_r2 = lr.score(x_train, y_train)
ridge_test_r2 = lr.score(x_test, y_test)
print("训练集上R^2:%.5f" % ridge_train_r2)
print("测试集上R^2:%.5f" % ridge_test_r2)
训练集上R^2:0.77123
测试集上R^2:0.56367
# Bagging ensemble of Ridge regressors: each of the 50 base learners is fit on
# 70% of the rows and 80% of the features, and their predictions are averaged.
base_ridge = Ridge(alpha=0.1)
bg = BaggingRegressor(base_ridge, n_estimators=50, max_samples=0.7, max_features=0.8, random_state=28)
bg.fit(x_train, y_train)
bg_train_r2 = bg.score(x_train, y_train)
bg_test_r2 = bg.score(x_test, y_test)
print("训练集上R^2:%.5f" % bg_train_r2)
print("测试集上R^2:%.5f" % bg_test_r2)
训练集上R^2:0.76210
测试集上R^2:0.57320
# AdaBoost ensemble built on plain linear-regression base learners
base_linear = LinearRegression()
adr = AdaBoostRegressor(base_linear, n_estimators=100, learning_rate=0.001, random_state=14)
adr.fit(x_train, y_train)
adr_train_r2 = adr.score(x_train, y_train)
adr_test_r2 = adr.score(x_test, y_test)
print("训练集上R^2:%.5f" % adr_train_r2)
print("测试集上R^2:%.5f" % adr_test_r2)
训练集上R^2:0.77334
测试集上R^2:0.56307
# Gradient boosting (GBDT); its base learner is always a CART regression tree
gbdt = GradientBoostingRegressor(learning_rate=0.01, n_estimators=100, random_state=14)
gbdt.fit(x_train, y_train)
gbdt_train_r2 = gbdt.score(x_train, y_train)
gbdt_test_r2 = gbdt.score(x_test, y_test)
print("训练集上R^2:%.5f" % gbdt_train_r2)
print("测试集上R^2:%.5f" % gbdt_test_r2)
训练集上R^2:0.76489
测试集上R^2:0.64161