Phishing Website Analysis (一)
决策树方法分析
- 数据集有1万多条记录,30个特征属性,1个结果属性,属性意义可以在下载数据的网页找到描述文件
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/d53f110e93bdf9cbd58de5b1d325f6d0.png)
代码实现
生成决策树模型
import numpy as np
import pandas as pd

# Load the data set and drop the non-predictive row-id column.
origin_data = pd.read_csv('phishing.csv')
data = origin_data.drop('id', axis=1)

# Sequential 80/20 train/test split (no shuffling).
cut = int(data.shape[0] * 0.8)
train = data[:cut]
test = data[cut:]

# Features are every column except the last; the label column is 'Result'.
train_set_x_orig = train.iloc[:, :-1].to_numpy()
train_set_y_orig = train['Result'].to_numpy()
test_set_x_orig = test.iloc[:, :-1].to_numpy()
test_set_y_orig = test['Result'].to_numpy()

# Hard-coded set of label values used by the model (-1 / 1).
classes = np.array([-1, 1])

# Reshape the label vectors to row vectors of shape (1, m).
train_set_y_orig = train_set_y_orig.reshape((1, -1))
test_set_y_orig = test_set_y_orig.reshape((1, -1))
def calEnt(dataSet):
    """Shannon entropy of the label column (the last column) of dataSet."""
    total = dataSet.shape[0]
    # Relative frequency of each distinct label value.
    freq = dataSet.iloc[:, -1].value_counts() / total
    # H = -sum(p * log2(p)); value_counts never produces zero counts,
    # so log2 is always defined.
    return -(freq * np.log2(freq)).sum()
def bestSplit(dataSet):
    """Return the index of the feature column with the highest information gain.

    Assumes the label is the last column; every other column is a candidate
    feature.  Returns -1 when no feature yields a positive gain.
    """
    baseEnt = calEnt(dataSet)
    bestGain = 0
    axis = -1
    # BUGFIX: the original loop started at range(1, ...), silently skipping
    # the first feature column (a leftover from before 'id' was dropped).
    # Every column except the last (label) is a feature.
    for i in range(dataSet.shape[1] - 1):
        levels = dataSet.iloc[:, i].value_counts().index
        # Weighted entropy of the partition induced by feature i.
        ents = 0
        for value in levels:
            childSet = dataSet[dataSet.iloc[:, i] == value]
            ents += (childSet.shape[0] / dataSet.shape[0]) * calEnt(childSet)
        infoGain = baseEnt - ents
        if infoGain > bestGain:
            bestGain = infoGain
            axis = i
    return axis
def mySplit(dataSet, axis, value):
    """Rows of dataSet whose column `axis` equals `value`, with that column dropped."""
    feature = dataSet.columns[axis]
    subset = dataSet[dataSet[feature] == value]
    return subset.drop(feature, axis=1)
def createTree(dataSet):
    """Recursively build an ID3 decision tree as nested dicts.

    Leaves are label values; internal nodes are {feature_name: {value: subtree}}.
    The label is assumed to be the last column of dataSet.
    """
    featlist = list(dataSet.columns)
    classlist = dataSet.iloc[:, -1].value_counts()
    # Stop when the node is pure OR no feature columns remain; the original
    # checked only the latter, recursing pointlessly on already-pure nodes.
    # value_counts sorts descending, so index[0] is the majority label.
    if classlist.iloc[0] == dataSet.shape[0] or dataSet.shape[1] == 1:
        return classlist.index[0]
    axis = bestSplit(dataSet)
    # No feature gives positive information gain: return a majority-class
    # leaf instead of letting featlist[-1] select the label column itself.
    if axis == -1:
        return classlist.index[0]
    bestfeat = featlist[axis]
    myTree = {bestfeat: {}}
    for value in set(dataSet.iloc[:, axis]):
        myTree[bestfeat][value] = createTree(mySplit(dataSet, axis, value))
    return myTree
# Train the decision tree on the training split and persist it to disk;
# np.save pickles the nested dict inside a 0-d object array, which is why
# the loading script needs allow_pickle=True and .item().
myTree = createTree(train)
np.save('./myTree.npy',myTree)
准确率
import numpy as np
import pandas as pd

# Reload the tree saved by the training script.  allow_pickle is required
# because the dict was stored as a pickled 0-d object array; .item()
# unwraps it back into a plain dict.
# NOTE: allow_pickle can execute arbitrary code -- only load trusted files.
read_myTree = np.load('myTree.npy', allow_pickle=True).item()
# (The original had a bare `read_myTree` expression here -- a REPL echo
# that is a no-op in a script; removed.)

# Rebuild the identical sequential 80/20 split used during training so
# `test` holds rows the tree never saw.
origin_data = pd.read_csv('phishing.csv')
data = origin_data.drop('id', axis=1)
train = data[0:int(data.shape[0]*0.8)]
test = data[int(data.shape[0]*0.8):]
classLabel = -1  # legacy module-level default; no longer mutated by classify


def classify(inputTree, labels, testVec):
    """Traverse a decision tree to predict the label of one sample.

    inputTree: nested dict produced by createTree (leaves are label values).
    labels:    column names whose positions match testVec's layout.
    testVec:   indexable feature vector for one sample.

    Returns the leaf label, or -1 when the sample's feature value has no
    branch in the tree.  BUGFIX: the original accumulated the result in a
    mutable global, so an unmatched sample silently reused the PREVIOUS
    call's prediction, making results depend on evaluation order.
    """
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = labels.index(firstStr)
    value = testVec[featIndex]
    if value in secondDict:
        subtree = secondDict[value]
        if isinstance(subtree, dict):
            return classify(subtree, labels, testVec)
        return subtree
    # No branch for this feature value: deterministic fallback.
    return -1
def acc_classify(inputTree, train, test):
    """Predict every row of `test` with the tree and print the accuracy.

    `train` supplies the column-name list mapping tree node names to
    feature positions.  Returns a copy of `test` with a 'predict' column
    appended.  BUGFIX: the original assigned the column into the caller's
    DataFrame slice, mutating shared data and triggering pandas'
    SettingWithCopyWarning; we now score a copy instead.
    """
    labels = list(train.columns)
    result = [classify(inputTree, labels, test.iloc[i])
              for i in range(test.shape[0])]
    scored = test.copy()
    scored['predict'] = result
    # Last column is the prediction, second-to-last the true Result label.
    acc = (scored.iloc[:, -1] == scored.iloc[:, -2]).mean()
    print(f'模型预测准确率为{acc}')
    return scored
# Score the reloaded tree on the held-out split (prints the accuracy).
acc_classify(read_myTree,train,test)
- 结果:
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/fbee2b2492f313d29fbb8c805b49a42a.png)
- 得到的pdf图为:
![](https://i-blog.csdnimg.cn/blog_migrate/d9277d2a0c50e46852097a4cf0c7c1e5.png)
没有经济基础不要随便在自己电脑上跑,消耗电脑资源,浪费时间;如果有服务器的,可以试一试