import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import lightgbm as lgb
# Directory holding the yearly indicator spreadsheets (one Excel file per
# indicator, one column per year).
base_path = '/home/yby/home/fenbushi/data/'
files_name = os.listdir(base_path)

# Training years — these are the literal column headers in the Excel files.
years = ['1999年', '2000年', '2001年', '2002年', '2003年',
         '2004年', '2005年', '2006年', '2007年']

# Cities treated as the positive class: label1 for the early years,
# label2 (a superset) for the later years.
label1 = ['北京', '天津', '上海', '广州', '重庆']
label2 = label1 + ['成都', '武汉', '郑州', '西安']

# Accumulator for the per-year training frames.
all_train_data = pd.DataFrame()
# Build the training table: for each year, one row per region (first 36 rows
# of each sheet) with one column per indicator file, plus a binary 'label'.
for year in years:
    # Seed the frame with the first indicator (household savings) for this year.
    train_data = pd.DataFrame(
        pd.read_excel(base_path + files_name[1], header=3, index_col=0)[year]
    ).rename(columns={year: 'jumin_chuxu'})[:36].reset_index()

    # Append the remaining indicators; the short column name is sliced from
    # the file name (chars 9:11) — assumes a fixed file-naming scheme, and
    # note files_name[1] is intentionally re-read here as in the original.
    # TODO(review): confirm the naming scheme and the duplicate column.
    for file_name in files_name[1:]:
        colum = file_name[9:11]
        tmp_data = pd.DataFrame(
            pd.read_excel(base_path + file_name, header=3, index_col=0)[year]
        ).rename(columns={year: colum})[:36].reset_index()
        train_data[colum] = tmp_data[colum]

    # Mark the "major city" rows as positive.
    # BUGFIX: the original compared `year` against '1999'/'2000', but `years`
    # holds '1999年'/'2000年', so the label1 branch could never run and
    # label2 was silently applied to every year.
    train_data['label'] = 0
    positives = label1 if year in ('1999年', '2000年') else label2
    for city in positives:
        train_data.loc[train_data['地区'] == city, 'label'] = 1

    all_train_data = pd.concat([all_train_data, train_data], ignore_index=True)
# Drop the non-numeric region-name column and give the Chinese indicator
# columns ASCII names for downstream use.
all_train_data.drop(axis=1, columns=['地区'], inplace=True)
all_train_data = all_train_data.rename(
    columns={'旅客': 'lvke', '生产': 'produce', '财收': 'money', '人口': 'population'})


def max_min_scaler(x):
    """Min-max scale a column to [0, 1].

    NOTE(review): a constant column yields 0/0 -> NaN here; LightGBM
    tolerates NaN inputs, but confirm the raw data has no constant columns.
    """
    return (x - np.min(x)) / (np.max(x) - np.min(x))


for col in all_train_data.columns:
    # ROBUSTNESS FIX: skip the binary target. Scaling a 0/1 label is a
    # no-op when both classes are present, but would produce NaN if one
    # class were ever absent — and the target should not be scaled anyway.
    if col == 'label':
        continue
    all_train_data[col] = all_train_data[[col]].apply(max_min_scaler)
# Assemble the hold-out year (2014) with exactly the same layout as the
# training table: household savings seeded first, then the remaining
# indicator files appended column by column.
year = '2014年'
test_data = pd.DataFrame(
    pd.read_excel(base_path + files_name[1], header=3, index_col=0)[year]
).rename(columns={year: 'jumin_chuxu'})[:36].reset_index()

for file_name in files_name[1:]:
    colum = file_name[9:11]
    tmp_data = pd.DataFrame(
        pd.read_excel(base_path + file_name, header=3, index_col=0)[year]
    ).rename(columns={year: colum})[:36].reset_index()
    test_data[colum] = tmp_data[colum]

test_data.drop(axis=1, columns=['地区'], inplace=True)
test_data = test_data.rename(
    columns={'旅客': 'lvke', '生产': 'produce', '财收': 'money', '人口': 'population'})

# NOTE(review): the test year is scaled with its OWN min/max rather than the
# statistics of the training set — confirm this leakage-free-but-inconsistent
# choice is intentional.
all_features = test_data.columns
for feat in all_features:
    test_data[feat] = test_data[[feat]].apply(max_min_scaler)
# Hold out 60 random rows for validation (fixed seed for reproducibility).
# FIX: the original called .sample(n=60, random_state=1) twice — once for
# the index and once for the frame. Sample once and reuse its index.
valid = all_train_data.sample(n=60, random_state=1)
index = valid.index
X_valid = valid[all_features]
Y_valid = valid['label']

# Everything not held out becomes the training set.
# reset_index(drop=True) replaces the original reset_index() followed by an
# explicit drop of the generated 'index' column — same result, one step.
train = all_train_data.drop(index=index).reset_index(drop=True)
X_train = train[all_features]
Y_train = train['label']
test = test_data
# LightGBM hyper-parameters for the binary "major city" classifier.
params = dict(
    boosting_type='gbdt',       # gradient-boosted decision trees
    objective='binary',         # binary classification
    metric='binary_logloss',    # evaluation metric on the valid set
    num_leaves=28,              # per-tree complexity
    learning_rate=0.01,         # small step size, paired with many rounds
    feature_fraction=0.8,       # column subsampling per tree
    bagging_fraction=0.9,       # row subsampling
    bagging_seed=0,
    bagging_freq=1,             # re-bag every iteration
    verbose=1,
    # reg_alpha=1,              # L1/L2 regularisation left disabled
    # reg_lambda=2,
    min_child_weight=6,         # minimum hessian sum per leaf
)
# FIX: removed the redundant `import lightgbm as lgb` that used to live
# here — lightgbm is already imported at the top of the file.
lgb_train = lgb.Dataset(X_train, Y_train)
lgb_evals = lgb.Dataset(X_valid, Y_valid, reference=lgb_train)

# Stage 1: train with early stopping on the validation fold to find the
# best number of boosting rounds.
# NOTE(review): `early_stopping_rounds`/`verbose_eval` keyword args were
# removed in LightGBM 4.x; migrate to
# callbacks=[lgb.early_stopping(3000), lgb.log_evaluation(4000)] on upgrade.
gbm_1 = lgb.train(params,
                  lgb_train,
                  num_boost_round=100000,
                  valid_sets=[lgb_train, lgb_evals],
                  valid_names=['train', 'valid'],
                  early_stopping_rounds=3000,
                  verbose_eval=4000)
valid_predict_browsed = gbm_1.predict(X_valid)

# Stage 2: retrain (without the validation set) for exactly the best
# iteration count found above, then score the 2014 hold-out year.
gbm_browsed = lgb.train(params,
                        lgb_train,
                        num_boost_round=gbm_1.best_iteration,
                        valid_sets=[lgb_train],
                        valid_names=['train'],
                        verbose_eval=4000)
test_predict_browsed = gbm_browsed.predict(test)

# Feature importance, most important first.
features = X_train.columns
feature_rank = gbm_browsed.feature_importance()
features_df = pd.DataFrame(
    {'column': features, 'importance': feature_rank}
).sort_values(by='importance', ascending=False)
print(features_df)
# 预测结果 (prediction results):