实训六 回顾与学习
import pandas as pd
# Load the raw data file. header=None tells pandas the file has no header
# row, so the columns are labelled with integers starting at 0.
df = pd.read_csv("data/adult.data", header=None)
# Preview the first rows to check the column layout.
df.head()
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
# Print a summary of the frame: per-column non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
0 32561 non-null int64
1 32561 non-null object
2 32561 non-null int64
3 32561 non-null object
4 32561 non-null int64
5 32561 non-null object
6 32561 non-null object
7 32561 non-null object
8 32561 non-null object
9 32561 non-null object
10 32561 non-null int64
11 32561 non-null int64
12 32561 non-null int64
13 32561 non-null object
14 32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
# Positions of the object-dtype (text) columns, per the df.info() summary.
str_cols = [1, 3, 5, 6, 7, 8, 9, 13, 14]

# Trim surrounding whitespace from every text column first, so the exact
# match on "?" below is not defeated by padding.
# NOTE(review): presumably the raw file pads fields with a space — confirm.
for col in str_cols:
    raw_column = df.iloc[:, col]
    df.iloc[:, col] = raw_column.map(lambda text: text.strip())

import numpy as np

# Turn the "?" placeholder into NaN, then discard every row that contains one.
df.replace("?", np.nan, inplace=True)
df.dropna(inplace=True)
from sklearn.preprocessing import LabelEncoder

# One entry per column: the fitted LabelEncoder for text columns, None for
# columns that were already numeric.
label_encoder = []

# Numeric copy of the frame, filled in column by column.
encoded_set = np.empty(df.shape)

for col in range(df.shape[1]):
    column = df.iloc[:, col]
    if column.dtype != object:
        # Already numeric: copy the values through unchanged.
        encoder = None
        encoded_set[:, col] = column
    else:
        # Text column: map each distinct label to an integer code.
        encoder = LabelEncoder()
        encoded_set[:, col] = encoder.fit_transform(column)
    label_encoder.append(encoder)

# Remove columns 2, 10 and 11 from the encoded matrix.
data = np.delete(encoded_set, [2, 10, 11], axis=1)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# The last column is the target; everything before it is a feature.
X = data[:, :-1]
y = data[:, -1]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a Gaussian naive Bayes classifier on the training portion.
gaussianNB = GaussianNB()
gaussianNB.fit(train_X, train_y)
# cross_val_score lives in sklearn.model_selection; the legacy
# sklearn.cross_validation module was removed in scikit-learn 0.20, so the
# old import fails on current releases.
from sklearn.model_selection import cross_val_score

# Number of cross-validation folds used for every metric below.
num_validations = 5

# NOTE(review): cross_val_score fits fresh clones of the estimator on folds of
# the held-out split, so these figures do not score the model that was fitted
# on train_X — confirm this is the intended evaluation.
accuracy = cross_val_score(gaussianNB, test_X, test_y,
                           scoring='accuracy', cv=num_validations)
print('准确率:{:.2f}%'.format(accuracy.mean()*100))

precision = cross_val_score(gaussianNB, test_X, test_y,
                            scoring='precision_weighted', cv=num_validations)
print('精确度:{:.2f}%'.format(precision.mean()*100))

recall = cross_val_score(gaussianNB, test_X, test_y,
                         scoring='recall_weighted', cv=num_validations)
print('召回率:{:.2f}%'.format(recall.mean()*100))

f1 = cross_val_score(gaussianNB, test_X, test_y,
                     scoring='f1_weighted', cv=num_validations)
print('F1 值:{:.2f}%'.format(f1.mean()*100))
from sklearn.metrics import classification_report, confusion_matrix

# Predictions of the fitted classifier on the held-out split.
y_pred = gaussianNB.predict(test_X)

# Confusion matrix of true labels against predicted labels.
confusion_mat = confusion_matrix(test_y, y_pred)
print(confusion_mat)

# Per-class precision / recall / F1, labelled with the readable class names.
target_names = ['<=50K', '>50K']
report = classification_report(test_y, y_pred, target_names=target_names)
print(report)
准确率:76.41%
精确度:79.53%
召回率:76.41%
F1 值:77.40%
[[5359 1408]
[ 672 1610]]
precision recall f1-score support
<=50K 0.89 0.79 0.84 6767
>50K 0.53 0.71 0.61 2282
avg / total 0.80 0.77 0.78 9049
# Show the first 100 values of column 1 as a NumPy array.
df[1].values[:100]
array(['State-gov', 'Self-emp-not-inc', 'Private', 'Private', 'Private',
'Private', 'Private', 'Self-emp-not-inc', 'Private', 'Private',
'Private', 'State-gov', 'Private', 'Private', 'Private',
'Self-emp-not-inc', 'Private', 'Private', 'Self-emp-not-inc',
'Private', 'Private', 'Federal-gov', 'Private', 'Private',
'Local-gov', 'Private', 'Private', 'Private', 'Local-gov',
'Private', 'Private', 'Federal-gov', 'State-gov', 'Private',
'Private', 'Private', 'Self-emp-not-inc', 'Private',
'Self-emp-not-inc', 'Private', 'Private', 'Private', 'Federal-gov',
'Private', 'Private', 'State-gov', 'Private', 'Private', 'Private',
'Federal-gov', 'Self-emp-inc', 'Private', 'Private', 'Private',
'Private', 'Private', 'Private', 'Private', 'Private', 'Private',
'Private', 'Private', 'Private', 'Self-emp-inc', 'Private',
'Private', 'Self-emp-not-inc', 'Private', 'Private', 'Private',
'Private', 'Private', 'Local-gov', 'Private', 'Private', 'Private',
'Private', 'Private', 'Private', 'Local-gov', 'Private', 'Private',
'Federal-gov', 'Private', 'Private', 'Private', 'Local-gov',
'Local-gov', 'Self-emp-not-inc', 'Private', 'Private',
'Federal-gov', 'Private', 'Private', 'Self-emp-not-inc', 'Private',
'Private', 'Self-emp-inc', 'Private', 'Local-gov'], dtype=object)
# Show the first 100 values of column 10. This must go through the DataFrame:
# `data` is the NumPy array produced by np.delete, which has no `.values`
# attribute and no longer contains column 10.
df[10].values[:100]
array([ 2174, 0, 0, 0, 0, 0, 0, 0, 14084,
5178, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 5013, 2407, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 14344, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0], dtype=int64)