import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import graphviz
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# Global matplotlib font size for any figures drawn by this script.
plt.rcParams['font.size'] = 24

# ---- Load training features and labels (both CSVs are GBK-encoded) ---------
data_file = pd.read_csv('123_data.csv', encoding='gbk')
label_file = pd.read_csv('123label.csv', encoding='gbk')

# Features and labels are paired purely by row position further down, so
# dropping incomplete rows from each table independently can shift one table
# relative to the other.  When the raw tables have the same number of rows,
# keep only the rows that are complete in BOTH tables.
# NOTE(review): assumes raw row i of both files describes the same sample --
# confirm against how the CSVs were produced.
same_length = len(data_file) == len(label_file)
data_file = data_file.dropna(axis=0)
label_file = label_file.dropna(axis=0)
if same_length:
    shared_rows = data_file.index.intersection(label_file.index)
    data_file = data_file.loc[shared_rows]
    label_file = label_file.loc[shared_rows]

# Columns that are not used as model inputs.
data_file = data_file.drop(
    ['Name', '平均年利润率', 'Value.销售金额', 'Value.采购金额'],
    axis=1,
)

# ---- Min-max scale every feature column to [0, 1] --------------------------
data_file = np.array(data_file.values)
max_value = data_file.max(axis=0)
min_value = data_file.min(axis=0)
value_range = max_value - min_value
# A constant column has zero range; divide by 1 so it maps to 0 instead of NaN.
value_range = np.where(value_range == 0, 1, value_range)
data_file = (data_file - min_value) / value_range

# ---- Train / test split (15 % held out; the split is not seeded) -----------
x_train_orig, x_test_orig, y_train_orig, y_test_orig = train_test_split(
    data_file, label_file['信誉评级'], test_size=0.15
)

# Fit the encoder ONCE, on the full label column, and reuse that single
# mapping for both splits.  Re-fitting on the test labels would assign
# different integer codes whenever a class is missing from the test split.
le = LabelEncoder()
le.fit(label_file['信誉评级'])
y_train_orig = le.transform(y_train_orig)
y_test_orig = le.transform(y_test_orig)
# ---- Pick the tree depth that scores best on the held-out split ------------
auc_test = []  # never appended to in the visible part of the script
m = 0  # best accuracy seen so far
k = 0  # max_depth that produced `m`

# Try max_depth = 1 .. 100.
# NOTE(review): splitter='random' is used without a random_state, so scores
# (and therefore the chosen depth) vary from run to run; the depth is also
# selected on the same split that is used for the reported score.
for i in range(100):
    clf = tree.DecisionTreeClassifier(
        class_weight='balanced',
        max_depth=i + 1,
        splitter='random',
        criterion='entropy',
    )
    clf = clf.fit(x_train_orig, y_train_orig)
    score = clf.score(x_test_orig, y_test_orig)
    print("score=", score)
    # `>=` means that among equally good depths the largest one is kept.
    if score >= m:
        m = score
        k = i + 1
    # The per-iteration predict_proba call was removed: its result was never
    # read and is recomputed after the final model is fitted.

print(m, 'k=', k)
# ---- Load the samples to be rated (GBK-encoded CSV) ------------------------
predict_file = pd.read_csv('302_data.csv', encoding='gbk')
# Keep exactly the model input columns.  (A preceding DataFrame.drop call whose
# result was discarded has been removed; it had no effect on the data.)
# NOTE(review): assumes these seven columns are the same features, in the same
# order, as the columns of the training matrix -- TODO confirm.
predict_file = predict_file.loc[
    :,
    [
        '销项SUM',
        '进项SUM',
        '企业规模指标',
        '企业平均利润',
        '企业平均年利润率',
        '企业产品退货率',
        '企业上下游影响力',
    ],
]

# ---- Fit the final model with the selected depth ---------------------------
# NOTE(review): `k` was selected with splitter='random' and
# criterion='entropy', whereas this model uses the estimator defaults --
# confirm that this difference is intended.
clf = tree.DecisionTreeClassifier(class_weight='balanced', max_depth=k)
clf = clf.fit(x_train_orig, y_train_orig)
# Class probabilities on the held-out split; not read again in the visible
# part of the script.
y_test_proba = clf.predict_proba(x_test_orig)

# ---- Scale the new samples exactly like the training features --------------
# The model was trained on min-max scaled inputs, so the new samples must go
# through the same transform with the TRAINING min/max.  Z-scoring them with
# their own mean/std would put them on a different scale from the one the
# tree thresholds were learned on.
predict_file = np.array(predict_file.values)
train_range = max_value - min_value
# Guard against constant training columns (zero range).
train_range = np.where(train_range == 0, 1, train_range)
predict_file = (predict_file - min_value) / train_range

# ---- Predict ---------------------------------------------------------------
y_pre_proba = clf.predict_proba(predict_file)
# predict_proba columns follow clf.classes_, so map the arg-max column back to
# the encoded class value rather than assuming column j means class j.
ans = clf.classes_[np.array(y_pre_proba).argmax(axis=1)]
print(ans)
# ---- Write one predicted rating per sample ---------------------------------
# Letter grade for each encoded class value.
# NOTE(review): assumes the encoded labels 0..3 correspond to 'A'..'D', i.e.
# the training labels are exactly these four grades -- verify.
ani = ['A', 'B', 'C', 'D']

# Samples are named E124, E125, ... in row order.  Iterating over `ans` itself
# (instead of a fixed 124..425 range) keeps this correct for any number of
# predicted rows: no IndexError when there are fewer, no dropped rows when
# there are more.
pred_res = [
    {'name': 'E' + str(number), '信誉等级': ani[code]}
    for number, code in enumerate(ans, start=124)
]

ans_csv = pd.DataFrame(pred_res)
ans_csv.to_csv('302_ans.csv', encoding='gbk')