数据和代码的下载链接:https://pan.baidu.com/s/19Rj_kP2iJ0szS6l2IWg6FQ
提取码:ezbd
1、数据分析
数据集为 divorce.xlsx,我们先来看一下数据说明。简单来说,每一个维度(Atr1~Atr54)对应调查问卷中的一个问题,最后一列 Class 是类别标签。如下图所示:
需要引入的库:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # 用这个库绘制数据分布特征图比较方便,pip install seaborn 就行了
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['KaiTi'] # 指定默认字体
from sklearn.model_selection import cross_val_score # k折交叉验证
from sklearn.model_selection import train_test_split # 导入自动生成训练集和测试集的模块
from sklearn.metrics import classification_report # 导入预测结果评估模块
from sklearn.linear_model import LogisticRegression # 逻辑回归
from sklearn.metrics import confusion_matrix # 混淆矩阵
from sklearn.ensemble import RandomForestClassifier # 随机森林分类
from sklearn.tree import DecisionTreeClassifier # 决策树
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn import svm # 支持向量机
读入数据
# Load the questionnaire dataset from the Excel file into a DataFrame.
data_train = pd.read_excel('divorce.xlsx')
# Let pandas report some basic information first: every column shows
# 170 non-null int64 values, so there are no missing values and no
# missing-value imputation is needed.
data_train.info()
# Print the (truncated) table to eyeball the raw values.
print(data_train)
# Per-column summary statistics; as the last expression of the notebook
# cell, its result is rendered as the cell output.
data_train.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170 entries, 0 to 169
Data columns (total 55 columns):
Atr1 170 non-null int64
Atr2 170 non-null int64
Atr3 170 non-null int64
Atr4 170 non-null int64
Atr5 170 non-null int64
Atr6 170 non-null int64
Atr7 170 non-null int64
Atr8 170 non-null int64
Atr9 170 non-null int64
Atr10 170 non-null int64
Atr11 170 non-null int64
Atr12 170 non-null int64
Atr13 170 non-null int64
Atr14 170 non-null int64
Atr15 170 non-null int64
Atr16 170 non-null int64
Atr17 170 non-null int64
Atr18 170 non-null int64
Atr19 170 non-null int64
Atr20 170 non-null int64
Atr21 170 non-null int64
Atr22 170 non-null int64
Atr23 170 non-null int64
Atr24 170 non-null int64
Atr25 170 non-null int64
Atr26 170 non-null int64
Atr27 170 non-null int64
Atr28 170 non-null int64
Atr29 170 non-null int64
Atr30 170 non-null int64
Atr31 170 non-null int64
Atr32 170 non-null int64
Atr33 170 non-null int64
Atr34 170 non-null int64
Atr35 170 non-null int64
Atr36 170 non-null int64
Atr37 170 non-null int64
Atr38 170 non-null int64
Atr39 170 non-null int64
Atr40 170 non-null int64
Atr41 170 non-null int64
Atr42 170 non-null int64
Atr43 170 non-null int64
Atr44 170 non-null int64
Atr45 170 non-null int64
Atr46 170 non-null int64
Atr47 170 non-null int64
Atr48 170 non-null int64
Atr49 170 non-null int64
Atr50 170 non-null int64
Atr51 170 non-null int64
Atr52 170 non-null int64
Atr53 170 non-null int64
Atr54 170 non-null int64
Class 170 non-null int64
dtypes: int64(55)
memory usage: 73.2 KB
Atr1 Atr2 Atr3 Atr4 Atr5 Atr6 Atr7 Atr8 Atr9 Atr10 ... Atr46 \
0 2 2 4 1 0 0 0 0 0 0 ... 2
1 4 4 4 4 4 0 0 4 4 4 ... 2
2 2 2 2 2 1 3 2 1 1 2 ... 3
3 3 2 3 2 3 3 3 3 3 3 ... 2
4 2 2 1 1 1 1 0 0 0 0 ... 2
.. ... ... ... ... ... ... ... ... ... ... ... ...
165 0 0 0 0 0 0 0 0 0 0 ... 1
166 0 0 0 0 0 0 0 0 0 0 ... 4
167 1 1 0 0 0 0 0 0 0 1 ... 3
168 0 0 0 0 0 0 0 0 0 0 ... 3
169 0 0 0 0 0 0 0 1 0 0 ... 3
Atr47 Atr48 Atr49 Atr50 Atr51 Atr52 Atr53 Atr54 Class
0 1 3 3 3 2 3 2 1 1
1 2 3 4 4 4 4 2 2 1
2 2 3 1 1 1 2 2 2 1
3 2 3 3 3 3 2 2 2 1
4 1 2 3 2 2 2 1 0 1
.. ... ... ... ... ... ... ... ... ...
165 0 4 1 1 4 2 2 2 0
166 1 2 2 2 2 3 2 2 0
167 0 2 0 1 1 3 0 0 0
168 3 2 2 3 2 4 3 1 0
169 4 4 0 1 3 3 3 1 0
[170 rows x 55 columns]
Out[23]:
Atr1 | Atr2 |
---|