sklearn决策树

import pandas as pd
import numpy as np
 
from sklearn import tree   # 导入模型
from sklearn.model_selection import train_test_split  # 制作数据集和测试集
from sklearn.preprocessing import LabelEncoder 
import graphviz 
from sklearn.metrics import roc_curve, auc 
 
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 24

# Load the training features: drop incomplete rows first, then the columns
# that are not used as model inputs.
data_file = pd.read_csv('123_data.csv', encoding='gbk')
data_file = data_file.dropna(axis=0)
data_file = data_file.drop(['Name', '平均年利润率', 'Value.销售金额', 'Value.采购金额'], axis=1)
label_file = pd.read_csv('123label.csv', encoding='gbk')
label_file = label_file.dropna(axis=0)

# The two files are cleaned independently, so a row removed from one may
# still be present in the other. Keep only the rows that survive in both so
# features and labels stay paired.
# NOTE(review): assumes row i of the data file describes the same enterprise
# as row i of the label file -- confirm against the CSVs.
data_file = data_file[data_file.index.isin(label_file.index)]
label_file = label_file[label_file.index.isin(data_file.index)]

# Min-max scale every feature column to [0, 1]. min_value / max_value stay at
# module level so later steps can reuse the same scaling.
data_file = np.array(data_file.values)
max_value = data_file.max(axis=0)
min_value = data_file.min(axis=0)
value_range = max_value - min_value
# A constant column has zero range; divide by 1 instead so it maps to 0
# rather than NaN.
value_range = np.where(value_range == 0, 1, value_range)
data_file = (data_file - min_value) / value_range

# Encode the rating labels once, on the full label column, before splitting.
# Fitting the encoder separately on the training and test labels (as was done
# before) gives each split its own label -> integer mapping, which disagrees
# whenever one split is missing a rating.
le = LabelEncoder()
encoded_labels = le.fit_transform(label_file['信誉评级'])

# Hold out 15% of the samples for evaluation; the targets are integer codes.
x_train_orig, x_test_orig, y_train_orig, y_test_orig = train_test_split(
    data_file, encoded_labels, test_size=0.15)

# Search tree depths 1..100 and remember the depth with the best accuracy on
# the held-out split. After the loop, m is the best score and k is the depth
# that achieved it.
auc_test = []  # never populated in this script; kept so the name still exists
m = 0
k = 0
for depth in range(1, 101):
    clf = tree.DecisionTreeClassifier(
        class_weight='balanced',
        max_depth=depth,
        splitter='random',
        criterion='entropy',
    )
    clf = clf.fit(x_train_orig, y_train_orig)
    score = clf.score(x_test_orig, y_test_orig)
    print("score=", score)
    # '>=' means ties are resolved in favour of the deepest tree tried.
    if score >= m:
        m = score
        k = depth
    # The class probabilities that used to be computed here on every
    # iteration were never read, so that call has been removed.

print(m, 'k=', k)
# Load the enterprises to be rated and keep the model features in a fixed
# column order. The earlier drop(...) call discarded its result (and named a
# column that is selected here), so it has been removed.
# NOTE(review): this column order is assumed to match the training feature
# columns -- confirm against 123_data.csv.
predict_file = pd.read_csv('302_data.csv', encoding='gbk')
predict_file = predict_file.loc[:, ['销项SUM', '进项SUM', '企业规模指标', '企业平均利润', '企业平均年利润率', '企业产品退货率', '企业上下游影响力']]

# Refit a tree at the depth k selected by the search.
# NOTE(review): the search loop uses splitter='random' and criterion='entropy'
# while this final model uses the library defaults -- confirm that is intended.
clf = tree.DecisionTreeClassifier(class_weight='balanced', max_depth=k)
clf = clf.fit(x_train_orig, y_train_orig)
y_test_proba = clf.predict_proba(x_test_orig)

# Scale the prediction features with the SAME min/max used for the training
# data. The tree's split thresholds live in that scaled space; z-scoring the
# prediction set with its own statistics (the previous behaviour) puts the
# inputs on a different scale than the one the model was trained on.
predict_file = np.array(predict_file.values)
train_range = max_value - min_value
# Guard constant training columns against division by zero.
train_range = np.where(train_range == 0, 1, train_range)
predict_file = (predict_file - min_value) / train_range

# For each row, take the index of the most probable class column.
y_pre_proba = clf.predict_proba(predict_file)
ans = np.array(y_pre_proba).argmax(axis=1)
print(ans)

# Convert the predictions into letter grades and write one row per enterprise.
# NOTE(review): the grade list assumes the encoded labels 0..3 correspond to
# 'A'..'D' -- confirm against the label file.
ani = ['A', 'B', 'C', 'D']
first_id = 124  # output names start at E124

# ans holds column positions of predict_proba; clf.classes_ maps each
# position back to the encoded label the model was trained on.
predicted_codes = clf.classes_[ans]

# Iterate over the predictions themselves instead of a fixed range(124, 426),
# so a prediction file with a different number of rows neither raises
# IndexError nor silently loses rows.
pred_res = [
    {'name': 'E' + str(first_id + offset), '信誉等级': ani[code]}
    for offset, code in enumerate(predicted_codes)
]

ans_csv = pd.DataFrame(pred_res)
ans_csv.to_csv('302_ans.csv', encoding='gbk')
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值