IDEA中实现Python随机森林模型预测人口

不要随地大便

已于 2024-07-17 10:15:11 修改

阅读量6.9k

点赞数 14

文章标签：机器学习 python idea 随机森林预测人口

于 2022-02-18 20:39:10 首次发布

本文链接：https://blog.csdn.net/qq_48273925/article/details/123010718

版权

数据集链接在评论区噢。训练数据中有22个自变量（valuexx是某种土地利用面积），因变量是最后的人口，每一行数据都是一个县市的数据。根据训练数据可以得到这22个自变量与因变量人口之间的函数关系，形式上可以理解为 y=a·x1+b·x2+…+c（a、b、c等都是常数，x1、x2等就是22个自变量的值；自变量的个数不一定是22，不同的模型可能会自动进行主成分分析，那时候自变量就没有22个了）。当新的数据（也就是要预测人口的数据，包含一样的22个自变量字段以及相同的单位和先后排列顺序）使用该模型进行预测时，会把22个对应的自变量代入这个关系式，得到新的y，即预测人口数量。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 读取数据
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dataframe import dataframe
import time
# Truncate each displayed cell to 10 characters. This option controls the
# width of a column's content, not how many columns are shown.
pd.set_option('display.max_colwidth', 10)
# The CSV header uses Chinese field names and the file is GBK-encoded.
# A raw string keeps the Windows backslashes literal instead of relying on
# invalid escape sequences such as "\桌" being passed through unchanged.
temp_data = pd.read_csv(r'D:\桌面\我的实验\长沙市数据.csv', encoding='gbk')
# Optional inspection helpers, left disabled:
# temp_data = pd.read_csv('temp_data.csv')
# print("输出读取的表格行数和列数：",temp_data.shape)
# print("输出前面6行数据：\n",temp_data.head())
# print(temp_data.info())  # info() prints its report itself and returns None
# print("输出所有变量的特征值\n",temp_data.describe())
# The '人口' (population) column is the regression target y.
y = temp_data['人口']
# Every remaining column is used as a predictor; drop(..., axis=1) removes
# the target column from the feature matrix.
X = temp_data.drop(['人口'], axis=1)

#######训练模型啦！！！！！！！！！！！！
from sklearn.model_selection import train_test_split  # 用来划分训练集和测试集
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Hold out 25% of the rows as a test set; X holds the predictors and y the
# target. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)
# Optional shape checks, left disabled:
# print(X_train.shape)
# print(y_train.shape)
# print(X_test.shape)
# print(y_test.shape)
# Data preparation is complete at this point.
# Keep the predictor column names so they can be paired with the feature
# importances later on.
features = [*X_train.columns]
# print("输出表头：",features)
# Baseline model: a random forest with default hyper-parameters.
# fit() returns the fitted estimator itself, so rf0 is the trained model.
rf0 = RandomForestRegressor(random_state=0).fit(X_train, y_train)
####### End of baseline model training #######

########### Evaluate the accuracy of the baseline model ###########
# Predict on the held-out test split created earlier (not the training split).
y_predict0 = rf0.predict(X_test)
error0 = mean_absolute_error(y_test, y_predict0)
print('当使用随机森林默认参数时，平均绝对误差为：', error0)
# Inspect how much each predictor contributes to the baseline model.
importances = list(rf0.feature_importances_)
print("特征权重值：", importances)
# The timer starts here, after the baseline model has already been fitted,
# so the elapsed time reported later does not include the baseline training.
start_time = time.time()
# Pair every feature name with its importance, rounded to 3 decimals.
feature_importances = [
    (feature, round(importance, 3))
    for feature, importance in zip(features, importances)
]
# To rank the features by importance, enable the following line:
# feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
print(feature_importances)
# Start of tuning: show the baseline model's current hyper-parameters.
# The original statement referenced the bound method without calling it,
# which did nothing when run as a script.
print(rf0.get_params())
# 因为数据量不大，所以我们直接选择网格搜索来选择最佳参数
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Search space for the grid search. Every candidate entry is currently
# commented out, so the grid is empty and the search below only
# cross-validates the estimator with its default settings.
param_grid = {
    # 'bootstrap': [True],  # sample rows with replacement when building each tree
    # 'max_depth': [1],  # maximum depth of each tree
    # 'max_features': ['auto'],  # features considered per split; NOTE(review): 'auto' was removed in scikit-learn 1.3 - confirm the installed version before enabling
    # 'min_samples_leaf': [20],  # minimum number of samples in a leaf
    # 'min_samples_split': [2, 11, 22],  # minimum samples needed to split an internal node
    # 'n_estimators': [650, 670, 700],
    # 'min_weight_fraction_leaf':[0,0.5],
}
# 5-fold cross-validated search scored by negative mean squared error.
# fit() returns the fitted search object, so the call can be chained.
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
).fit(X_train, y_train)
# Report the best parameter combination found by the search.
print(grid_search_rf.best_params_)
# "Tuned" model with hand-picked hyper-parameters.
# max_features=1.0 considers every feature at each split. That is what
# 'auto' meant for regressors; 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3, where passing it raises an error.
# random_state=0 matches the baseline model so repeated runs give the same
# trees and the same reported error.
rf1 = RandomForestRegressor(
    bootstrap=True,
    max_depth=10,
    max_features=1.0,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=20,
    random_state=0,
)
# Fit on the training predictors with y_train as the target.
rf1.fit(X_train, y_train)
# Predict on the held-out test split created earlier.
y_predict1 = rf1.predict(X_test)
error1 = mean_absolute_error(y_test, y_predict1)
print('调参后，平均绝对误差为：', error1)
# Optional side-by-side comparison of both models, left disabled:
# result = pd.DataFrame(y_test)
# result['系统参数'] = y_predict0
# result['调参后'] = y_predict1
# print(round(result.head(10)))
end_time = time.time()  # end of the timed section started at start_time
print("模型运行时间：", end_time - start_time)
# Optional model persistence, left disabled. It needs `import joblib`, and
# the Windows paths are written as raw strings so the backslashes stay literal.
# joblib.dump(rf1, r'D:\桌面\model1.pkl')  # save the trained model
# print("模型保存成功")
# clf = joblib.load(r'D:\桌面\model1.pkl')  # load the previously saved model
# print("模型加载成功")
# Time the prediction stage with a monotonic clock meant for intervals.
a = time.perf_counter()
# Load the rows to predict. A raw string keeps the Windows backslashes
# literal instead of relying on invalid escape sequences such as "\桌".
x = pd.read_csv(r'D:\桌面\我的实验\研究区.csv', encoding='gbk')
# Apply the tuned model to the new data (this is not the test split).
d = rf1.predict(x)
# pd.set_option('display.max_columns',None)  # show every column
pd.set_option('display.max_rows', None)  # show every row when printing
result = pd.DataFrame()
result['调参后'] = d
# Print at most the first 960 predictions, rounded to whole numbers.
print(round(result.head(960)))
b = time.perf_counter()
print("调用模型运行时间：", b - a)

训练模型使用的数据（由于22个自变量太多了，没有截到最后面的其他自变量和人口因变量）：