给出 AllElectronics.csv 文件，其原始训练数据如下：
RID | age | income | student | credit_rating | class_buy_computer |
1 | youth | high | no | fair | no |
2 | youth | high | no | excellent | no |
3 | middle_aged | high | no | fair | yes |
4 | senior | medium | no | fair | yes |
5 | senior | low | yes | fair | yes |
6 | senior | low | yes | excellent | no |
7 | middle_aged | low | yes | excellent | yes |
8 | youth | medium | no | fair | no |
9 | youth | low | yes | fair | yes |
10 | senior | medium | yes | fair | yes |
11 | youth | medium | yes | excellent | yes |
12 | middle_aged | medium | no | excellent | yes |
13 | middle_aged | high | yes | fair | yes |
14 | senior | medium | no | excellent | no |
通过 scikit-learn 实现决策树，对上述数据进行训练，并进行预测。
模板如下:
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO
from __builtin__ import str
from _csv import reader
from docutils.nodes import header
# Train an entropy-based decision tree on the AllElectronics data set:
# read the CSV, one-hot encode features and labels, fit the classifier,
# export the tree in Graphviz format and predict one modified sample.

# Open the CSV file. The `with` block closes the handle as soon as every row
# has been consumed instead of leaking it for the life of the process.
# NOTE(review): mode 'rb' is what Python 2's csv module expects; under
# Python 3 the file has to be opened in text mode with newline='' instead.
with open(r'E:\eclipse-jee-neon-3-win32\workspace\DeepLearningBasicsMachineLearning\DecisionTree\data\AllElectronics.csv','rb') as allElectronicsData:
    reader=csv.reader(allElectronicsData)
    # The first row carries the column names. The builtin next() is used
    # rather than the Python-2-only reader.next() method.
    headers=next(reader)
    print(headers)

    featureList=[]
    labelList=[]
    for row in reader:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which csv.reader yields as empty lists.
        if not row:
            continue
        # The last column is the class label.
        labelList.append(row[-1])
        # Columns 1 .. len(row)-2 are the features; column 0 (the first
        # header, the row id) and the label column are left out.
        rowDict={headers[i]: row[i] for i in range(1,len(row)-1)}
        featureList.append(rowDict)
print(featureList)

# One-hot encode the categorical feature dictionaries into a dense array.
vec=DictVectorizer()
dummyX=vec.fit_transform(featureList).toarray()
print("dummyX:"+str(dummyX))
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); confirm against the installed version.
featureNames=vec.get_feature_names()
print(featureNames)
print("labelList:"+str(labelList))

# Binarize the class labels.
lb=preprocessing.LabelBinarizer()
dummyY=lb.fit_transform(labelList)
print("dummyY:"+str(dummyY))

# Fit the tree using information gain (entropy) as the split criterion.
clf=tree.DecisionTreeClassifier(criterion='entropy')
clf=clf.fit(dummyX, dummyY)
print("clf:"+str(clf))

# Write the fitted tree in Graphviz .dot format. The return value is not
# assigned, so the open file handle `f` is not rebound inside its own block.
with open("AllElectronics.dot",'w') as f:
    tree.export_graphviz(clf, feature_names=featureNames, out_file=f)

#predict
oneRowX=dummyX[0,:]
print("oneRowX:"+str(oneRowX))
# Work on a copy: dummyX[0,:] is a view into the training matrix, so editing
# it in place would silently overwrite the first training row.
newRowX=oneRowX.copy()
# Flip two one-hot columns of the sample.
# NOTE(review): with DictVectorizer's sorted feature names these are
# presumably age=middle_aged (0) and age=youth (2); verify against the
# feature names printed above.
newRowX[0]=1
newRowX[2]=0
print("newRowX:"+str(newRowX))
predictedY=clf.predict([newRowX])
print("predictedY:"+str(predictedY))