文献来源
【Forecasting credit ratings of decarbonized firms: Comparative assessment of machine learning models】
文章的代码复现有两个基本工作:1.提取每个算法的特征重要性;2.计算每个算法的评价指标
算法有 CRT 分类决策树 ANN 人工神经网络 RFE 随机森林(原文用 RFE 指代 Random Forest) SVM支持向量机
评价指标有F1 Score ;Specificity ;Accuracy
1.准备数据
分类标签【信誉等级CR1-CR2】和特征向量【变量】
[特征-变量]Probability of Default违约概率
Coverages覆盖范围
Capital Structure资本结构
Liquidity流动性
Profitability盈利能力
Operating Efficiency运营效率
Scale, Scope, and Diversity规模、范围和多样性
Competitive Advantage竞争优势
Fiscal Strength and Credit Conditions财政实力和信用状况
Systemic Governance and Effectiveness系统治理和有效性
Economic Strength经济实力
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# --- Synthetic dataset generation ------------------------------------------
# Builds a simulated panel of `num_companies` firms with 11 uniform [0, 1)
# credit-risk features and a binary rating label (1 or 2) mirroring the
# variables listed in the paper.
#
# Fix: seed the RNG so the simulated dataset is reproducible across runs —
# the original regenerated different data each execution, making the
# reported metrics impossible to reproduce.
np.random.seed(42)

num_companies = 500

# One uniform [0, 1) column per explanatory variable from the paper.
feature_names = [
    'Probability_of_Default',
    'Coverages',
    'Capital_Structure',
    'Liquidity',
    'Profitability',
    'Operating_Efficiency',
    'Scale_Scope_and_Diversity',
    'Competitive_Advantage',
    'Fiscal_Strength_and_Credit_Conditions',
    'Systemic_Governance_and_Effectiveness',
    'Economic_Strength',
]
features = {name: np.random.uniform(0, 1, num_companies) for name in feature_names}

# Binary rating label (CR1 = 1, CR2 = 2), assigned uniformly at random.
# NOTE(review): labels are independent of the features, so no model can
# genuinely learn here; test accuracy near 0.5 is expected.
features['Rating'] = np.random.choice([1, 2], num_companies)

# Create a DataFrame
df = pd.DataFrame(features)

# Preview the data
df.head()
# NOTE(review): hard-coded absolute Windows path — not portable; adjust
# before running on another machine. Requires openpyxl for .xlsx output.
df.to_excel(r'C:\Users\12810\Desktop\组会\组会记录\2024-0103寒假集训材料\看论文任务\要讲的\模拟数据.xlsx')
df.head()
Probability_of_Default | Coverages | Capital_Structure | Liquidity | Profitability | Operating_Efficiency | Scale_Scope_and_Diversity | Competitive_Advantage | Fiscal_Strength_and_Credit_Conditions | Systemic_Governance_and_Effectiveness | Economic_Strength | Rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.298531 | 0.328423 | 0.665525 | 0.985169 | 0.107456 | 0.933863 | 0.714293 | 0.681608 | 0.508398 | 0.472675 | 0.508559 | 2 |
1 | 0.841430 | 0.736388 | 0.999020 | 0.689946 | 0.348265 | 0.929935 | 0.066077 | 0.609516 | 0.797929 | 0.048373 | 0.424858 | 2 |
2 | 0.462897 | 0.105783 | 0.716292 | 0.912855 | 0.564482 | 0.850507 | 0.774066 | 0.880007 | 0.737817 | 0.729397 | 0.283405 | 2 |
3 | 0.062724 | 0.073537 | 0.611761 | 0.213703 | 0.483220 | 0.668749 | 0.052895 | 0.924532 | 0.134043 | 0.126261 | 0.910167 | 1 |
4 | 0.242200 | 0.089723 | 0.874793 | 0.659927 | 0.159241 | 0.348462 | 0.828590 | 0.273572 | 0.117796 | 0.154820 | 0.324018 | 2 |
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
# --- Model training and evaluation -----------------------------------------
# Define features and labels
X = df.drop('Rating', axis=1)
y = df['Rating']

# Split the data into training and test sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the four classifiers compared in the paper.
# NOTE(review): 'RFE' is the paper's label for Random Forest here,
# not recursive feature elimination.
classifiers = {
    'CRT': DecisionTreeClassifier(random_state=42),
    'ANN': MLPClassifier(random_state=42, max_iter=1000),
    'RFE': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

# Dictionary to store models' performance metrics
performance_metrics = {}

# Training and evaluating classifiers
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)  # Train the model
    y_pred = clf.predict(X_test)  # Predict on test set

    # Per-class precision / recall / F1 plus overall accuracy.
    performance_metrics[name] = classification_report(y_test, y_pred, output_dict=True)
    performance_metrics[name]['accuracy'] = accuracy_score(y_test, y_pred)

    # Fix: the paper's third metric, specificity, was never computed even
    # though confusion_matrix is imported at the top of the file.
    # Treating rating 2 as the positive class, specificity = TN / (TN + FP),
    # i.e. the fraction of true class-1 samples predicted as class 1.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[1, 2]).ravel()
    performance_metrics[name]['specificity'] = tn / (tn + fp) if (tn + fp) else 0.0

# Since ANN and SVM don't provide a direct method for feature importance, we will skip that part as per user's instructions.
performance_metrics
# --- Per-model reports and tree-based feature importances -------------------
# ... (previous code to generate data and split sets)

# Initialize classifiers
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)
ann = MLPClassifier(random_state=42, max_iter=1000)
svm = SVC(random_state=42)

# Fit every model on the training split and print its classification
# report and accuracy on the held-out test split.
for model in (decision_tree, random_forest, ann, svm):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"{model.__class__.__name__} metrics:")
    print(classification_report(y_test, predictions))
    print(f"Accuracy: {accuracy_score(y_test, predictions)}\n")

# Only the tree-based models expose impurity-based feature importances.
print(f"Decision Tree Feature Importance: {decision_tree.feature_importances_}")
print(f"Random Forest Feature Importance: {random_forest.feature_importances_}")
D:\install_file\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.
warnings.warn(
DecisionTreeClassifier metrics:
precision recall f1-score support
1 0.36 0.38 0.37 45
2 0.47 0.45 0.46 55
accuracy 0.42 100
macro avg 0.42 0.42 0.42 100
weighted avg 0.42 0.42 0.42 100
Accuracy: 0.42
RandomForestClassifier metrics:
precision recall f1-score support
1 0.39 0.40 0.40 45
2 0.50 0.49 0.50 55
accuracy 0.45 100
macro avg 0.45 0.45 0.45 100
weighted avg 0.45 0.45 0.45 100
Accuracy: 0.45
MLPClassifier metrics:
precision recall f1-score support
1 0.40 0.44 0.42 45
2 0.50 0.45 0.48 55
accuracy 0.45 100
macro avg 0.45 0.45 0.45 100
weighted avg 0.46 0.45 0.45 100
Accuracy: 0.45
SVC metrics:
precision recall f1-score support
1 0.45 0.42 0.44 45
2 0.55 0.58 0.57 55
accuracy 0.51 100
macro avg 0.50 0.50 0.50 100
weighted avg 0.51 0.51 0.51 100
Accuracy: 0.51
Decision Tree Feature Importance: [0.1373407 0.1034318 0.09762338 0.047583 0.13000749 0.04778061
0.04700612 0.09306039 0.13848067 0.05285945 0.1048264 ]
Random Forest Feature Importance: [0.09758143 0.09809294 0.09033973 0.08525976 0.09331244 0.08281067
0.094648 0.10032612 0.08863182 0.08533643 0.08366067]
D:\install_file\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.
warnings.warn(
# --- Permutation importance for models without built-in importances ---------
# ANN and SVM expose no feature_importances_; use permutation importance
# (mean score drop over 30 shuffles of each feature) instead.
from sklearn.inspection import permutation_importance

perm_results = {}
for label, model in (("ANN", ann), ("SVM", svm)):
    model.fit(X_train, y_train)
    perm_results[label] = permutation_importance(
        model, X_test, y_test, n_repeats=30, random_state=42
    )

perm_importance_ann = perm_results["ANN"]
perm_importance_svm = perm_results["SVM"]

# Mean importance per feature across the repeats.
feature_importances_ann = perm_importance_ann.importances_mean
feature_importances_svm = perm_importance_svm.importances_mean

# You can then display these importances or further analyze them
print("Feature importances from ANN:", feature_importances_ann)
print("Feature importances from SVM:", feature_importances_svm)
D:\install_file\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.
warnings.warn(
Feature importances from ANN: [-0.04366667 -0.04866667 -0.06166667 -0.01633333 -0.06166667 -0.069
-0.059 -0.07333333 -0.02866667 -0.041 -0.04266667]
Feature importances from SVM: [-0.002 -0.00633333 0.019 0.035 0.00666667 0.00233333
-0.00233333 -0.01466667 -0.02 0.00833333 0.02 ]