import matplotlib.pyplot
as plt
from sklearn
import tree
as te
import pandas
as pd
import numpy
as np
from sklearn.metrics
import confusion_matrix
# Load the data
def createDataSet():
    """Return the hard-coded loan-application sample set and its feature names.

    Each record is ``[age, working?, house?, credit_situation, decision]``,
    where the final element is the class label (``'agree'`` / ``'refuse'``).
    Note that the credit column mixes the integer ``1`` with the strings
    ``'2'`` and ``'3'``; the values are returned exactly as written here.

    Returns:
        tuple: ``(dataSet, labels)`` -- a fresh list of 15 five-element lists
        and a fresh list with the four feature names.
    """
    feature_names = ['age', 'working?', 'house?', 'credit_situation']
    records = [
        ['youth', 'no', 'no', 1, 'refuse'],
        ['youth', 'no', 'no', '2', 'refuse'],
        ['youth', 'yes', 'no', '2', 'agree'],
        ['youth', 'yes', 'yes', 1, 'agree'],
        ['youth', 'no', 'no', 1, 'refuse'],
        ['mid', 'no', 'no', 1, 'refuse'],
        ['mid', 'no', 'no', '2', 'refuse'],
        ['mid', 'yes', 'yes', '2', 'agree'],
        ['mid', 'no', 'yes', '3', 'agree'],
        ['mid', 'no', 'yes', '3', 'agree'],
        ['elder', 'no', 'yes', '3', 'agree'],
        ['elder', 'no', 'yes', '2', 'agree'],
        ['elder', 'yes', 'no', '2', 'agree'],
        ['elder', 'yes', 'no', '3', 'agree'],
        ['elder', 'no', 'no', 1, 'refuse'],
    ]
    return records, feature_names
# Test code
def createDataSet2():
    """Return the hard-coded Walk/Drive sample set and its feature names.

    Each record is
    ``[number, Weather, Cloth, Shopping, Weekend, Temp>90, outcome]``; the
    final element is the class label (``'Walk'`` or ``'Drive'``).  One record
    carries an empty string in the Shopping column; it is returned unchanged.

    Returns:
        tuple: ``(dataSet, labels)`` -- a fresh list of 12 seven-element lists
        and a fresh list with the six feature names.
    """
    feature_names = ['number', 'Weather', 'Cloth', 'Shopping', 'Weekend', 'Temp>90']
    records = [
        [1, 'Rain', 'Formal', 'Yes', 'Yes', 'No', 'Walk'],
        [4, 'Snow', 'casual', 'No', 'No', 'Yes', 'Drive'],
        [7, 'Good', 'casual', 'No', 'No', 'No', 'Walk'],
        [10, 'Rain', 'Formal', 'Yes', 'Yes', 'No', 'Walk'],
        [5, 'Good', 'casual', 'Yes', 'No', 'Yes', 'Walk'],
        [6, 'Good', 'Formal', 'No', 'No', 'No', 'Drive'],
        [8, 'Snow', 'Formal', 'No', 'Yes', 'Yes', 'Drive'],
        [1, 'Rain', 'Formal', 'Yes', 'No', 'Yes', 'Walk'],
        [4, 'Snow', 'casual', 'Yes', 'Yes', 'Yes', 'Drive'],
        [7, 'Good', 'Formal', 'No', 'No', 'Yes', 'Drive'],
        [10, 'Good', 'casual', '', 'No', 'Yes', 'Drive'],
        [10, 'Good', 'casual', 'No', 'Yes', 'Yes', 'Drive'],
    ]
    return records, feature_names
def createDataSet3():
    """Load the event-count matrix and labels as plain nested lists.

    Reads ``./MatrixEventAndLable/rawTFVector.txt`` (space separated, no
    header), drops its last column, and appends the first column of
    ``./MatrixEventAndLable/mlabel.txt`` as the final element of every row.

    Returns:
        tuple: ``(dataSet, labels)`` where ``dataSet`` is a list of row lists
        (feature values followed by the class label) and ``labels`` is the
        list of generated feature names ``"E0"``, ``"E1"``, ... -- one per
        feature column.
    """
    matrixevent_df = pd.read_csv("./MatrixEventAndLable/rawTFVector.txt",
                                 sep=' ', header=None)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent accessor and is available on every pandas version.
    matrix_arrs = matrixevent_df.values
    # NOTE(review): the last column is discarded unconditionally -- presumably
    # an empty column produced by a trailing separator; confirm with the data.
    event_count_matrix = np.delete(matrix_arrs, matrix_arrs.shape[1] - 1, axis=1)

    label_df = pd.read_csv("./MatrixEventAndLable/mlabel.txt",
                           sep=' ', header=None)
    class_column = label_df.values[:, 0]

    # Feature columns first, class label as the last column of every row.
    combined = np.column_stack((event_count_matrix, class_column))

    feature_names = ["E" + str(i) for i in range(event_count_matrix.shape[1])]
    # list(row) keeps the elements as numpy scalars, as the original
    # element-by-element copy did.
    rows = [list(row) for row in combined]
    return rows, feature_names
def createDataSet4():
    """Load the event-count matrix and labels and split them into train/test.

    Reads ``./MatrixEventAndLable/rawTFVector.txt`` and
    ``./MatrixEventAndLable/mlabel.txt`` (both space separated, no header)
    and drops the last column of each.  The first two thirds of the rows
    (rounded down) form the training part; the remaining rows form the test
    part.  Rows are split in file order, without shuffling.

    Returns:
        tuple: ``(dataSet, labels, testdata, testlabels)`` as numpy arrays.
        The label arrays keep the 2-D shape left after the column drop.
    """
    matrixevent_df = pd.read_csv("./MatrixEventAndLable/rawTFVector.txt",
                                 sep=' ', header=None)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent accessor and is available on every pandas version.
    matrix_arrs = matrixevent_df.values
    # NOTE(review): the last column of both files is discarded unconditionally
    # -- presumably an empty column from a trailing separator; confirm.
    event_count_matrix = np.delete(matrix_arrs, matrix_arrs.shape[1] - 1, axis=1)

    label_df = pd.read_csv("./MatrixEventAndLable/mlabel.txt",
                           sep=' ', header=None)
    label_arrs = label_df.values
    label_arrs = np.delete(label_arrs, label_arrs.shape[1] - 1, axis=1)

    # Two thirds of the rows for training, the rest for testing.
    train_size = event_count_matrix.shape[0] * 2 // 3

    train_data = event_count_matrix[:train_size]
    train_labels = label_arrs[:train_size]
    test_data = event_count_matrix[train_size:]
    test_labels = label_arrs[train_size:]
    return train_data, train_labels, test_data, test_labels
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


if __name__ == "__main__":
    # myDat, labels = createDataSet3()
    # myTree = tree.createTree(myDat, labels)
    ## when do createDataSet3, we will got following myTree
    #myTree = {'E25': {0: 1, 3: {'E19': {0: {'E26': {0: {'E12': {0: {'E17': {0: {'E27': {0: {'E0': {0: 0, 2: 1}}, 3: 1}}, 1: {'E3': {0: 1, 2: 1, 3: 1, 4: 1, 6: 0}}, 2: 1}}, 3: 1}}, 1: {'E3': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 16: 0, 15: 0}}, 2: 1}}, 1: {'E2': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: {'E3': {0: 1, 3: 1, 4: {'E26': {0: 0, 1: 1}}, 5: 1}}, 10: 1, 203: 1}}, 2: 1, 3: 1}}, 4: {'E20': {0: {'E4': {3: 1, 4: 0}}, 1: {'E26': {0: {'E12': {0: 0, 3: 1}}, 1: 1}}, 3: {'E3': {0: 1, 16: 1, 2: 1, 3:1, 4: 1, 5: 1, 6: 1, 17: 1, 25: 0, 18: 1, 15: 0}}, 4: {'E3': {0: {'E2': {0: 1, 1: 1, 203: 0}}, 1: {'E12': {0: 0, 3: 1}}, 2: {'E22': {3: 1, 4: 0}}, 3: {'E2': {1: {'E4': {4: 0, 7: 1}}, 3: 1}}, 4: 1, 5: 1, 6: 1, 41: 0, 15: 1, 16: {'E22': {3: 1, 4: 0}}, 17: {'E22': {3: 1, 4: 0}}, 19: 1}}}}, 5: {'E20': {1: {'E17': {1: {'E26': {0: {'E12': {0: 0, 3: 1}}, 1: 1}}, 2: 1}}, 2: 1, 4: {'E3': {0: {'E22': {3: 1, 4: {'E19': {0: {'E4': {4: 0, 7: 1}}, 1: 1, 2: 1, 3: 1}}}}, 16: 1, 2: {'E22': {3: 1, 4: 0}}, 3: 1, 4: 1, 17: {'E22': {3: 1, 4: 0}}, 23: 0, 18: 0, 15: 0}}, 5: {'E19': {0: 1, 1: 1, 2: {'E26': {0: 0, 1: 1}}}}}}, 6: {'E1': {0: 1, 1: 1, 2: 1, 3: 0}}, 7: 1, 8: 1, 9: {'E26': {0: {'E0': {0: 0, 2: 1}}, 1: 1}}, 10: {'E2': {0: 1, 203: 0}}, 11: {'E15': {2: 1, 3: 1, 4: 0, 5: {'E6': {0: 0, 1: 1}}}}, 12: {'E1': {0: 0, 1: 1}}, 13: 1}}
    #print(myTree)
    #createPlot(myTree)

    # Train a scikit-learn decision tree on the first part of the data and
    # evaluate it on the held-out remainder.
    traindata, trainlabels, testdata, testlabels = createDataSet4()
    clf = te.DecisionTreeClassifier()
    # createDataSet4 returns labels as a column vector; flatten it because
    # fit() expects 1-D targets and warns (DataConversionWarning) otherwise.
    clf = clf.fit(traindata, np.ravel(trainlabels))

    # install graphviz first (https://graphviz.gitlab.io/_pages/Download/Download_source.html)
    te.export_graphviz(clf, out_file="tree.dot")  # dot -Tpng tree.dot -o tree.png

    prediction = list(clf.predict(testdata))
    assert len(prediction) == len(testlabels)

    counts = confusion_matrix(np.ravel(testlabels), prediction).ravel()
    if counts.size != 4:
        # The tn/fp/fn/tp unpacking below only makes sense for a 2x2 matrix.
        raise ValueError(
            "expected a binary (2x2) confusion matrix, got %d cells" % counts.size)
    tn, fp, fn, tp = counts
    print("TP: %d FP: %d FN: %d TN: %d" % (tp, fp, fn, tn))

    # Guard every ratio: with no predicted or no actual positives the plain
    # division would yield NaN (and a RuntimeWarning) instead of a score.
    P = _safe_ratio(tp, tp + fp)
    R = _safe_ratio(tp, tp + fn)
    F1_SCORE = _safe_ratio(2 * (P * R), P + R)
    print("P: %f R: %f F1_SCORE: %f" % (P, R, F1_SCORE))