实验一 决策树
1、数据集
收入 | 信用历史 | 债务 | 结果 |
---|---|---|---|
0_5K | Bad | Low | Reject |
0_5K | Good | Low | Approve |
0_5K | Unknown | High | Reject |
0_5K | Unknown | Low | Approve |
0_5K | Unknown | Low | Approve |
0_5K | Unknown | Low | Reject |
5_10K | Bad | High | Reject |
5_10K | Good | High | Approve |
5_10K | Unknown | High | Approve |
5_10K | Unknown | Low | Approve |
5_10K | Bad | Low | Reject |
5_10K | Good | Low | Approve |
2、代码
import operator
from math import log
# 创造示例数据
def createDataSet2():
    """Build the loan-approval sample set.

    Returns:
        (dataSet, labels): twelve rows of [income, credit history, debt,
        decision], plus the names of the three feature columns (the
        decision column has no label).
    """
    rows = [
        ['0_5K', 'Bad', 'Low', 'Reject'],
        ['0_5K', 'Good', 'Low', 'Approve'],
        ['0_5K', 'Unknown', 'High', 'Reject'],
        ['0_5K', 'Unknown', 'Low', 'Approve'],
        ['0_5K', 'Unknown', 'Low', 'Approve'],
        ['0_5K', 'Unknown', 'Low', 'Reject'],
        ['5_10K', 'Bad', 'High', 'Reject'],
        ['5_10K', 'Good', 'High', 'Approve'],
        ['5_10K', 'Unknown', 'High', 'Approve'],
        ['5_10K', 'Unknown', 'Low', 'Approve'],
        ['5_10K', 'Bad', 'Low', 'Reject'],
        ['5_10K', 'Good', 'Low', 'Approve'],
    ]
    featureNames = ['Income', 'Credit_History', 'Debt']
    return rows, featureNames
# 计算数据的熵(entropy), 只根据最后一列, 即分类结果来计算熵。
def calcShannonEnt(dataSet):
    """Return the Shannon entropy of the decision column (last element of
    each row), in bits.

    Only the class frequencies matter: first tally how often each decision
    appears, then sum -p*log2(p) over the classes.
    """
    total = len(dataSet)
    counts = {}
    for row in dataSet:
        decision = row[-1]
        counts[decision] = counts.get(decision, 0) + 1
    entropy = 0.0
    for tally in counts.values():
        p = tally / float(total)
        entropy -= p * log(p, 2)
    return entropy
# 按某个特征分类后的数据
def splitDataSet(dataSet, axis, value):
    """Return the rows whose `axis`-th field equals `value`, with that
    field removed from each returned row (the original rows are untouched).
    """
    return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]
# 选择最优的分类特征
def chooseBestFeatureToSplit(dataSet):
numFeatures = len(dataSet[0]) - 1 # 横向, 属性的个数, 不包括最后的决策。
baseEntropy = calcShannonEnt(dataSet) # 计算未分类前的熵, 只根据最后一列进行计算
bestInfoGain = 0
bestFeature = -1
for i in range(numFeatures): # 循环决策前的每一个属性, 挑选信息增益最大的属性, 外层循环
featList = [example[i] for example in dataSet] # 第i 个属性的所有值
uniqueVals = set(featList) # 集合( set) 是一个无序的不重复元素序列, 得到第i 个属性的所有选择
newEntropy = 0
for value in uniqueVals: # 内层循环
# 把dataset集合中dataset[axis = i] ==value 的数据放到subDataSet中, 并且去除了 i 列数据, 所以subDataSet比dataSet少一列
subDataSet = splitDataSet(dataSet, i, value)
prob = len(subDataSet) / float(len(dataSet))
subDataSetEntropy = calcShannonEnt(subDataSet) # 熵, 只根据最后一列进行计算
newEntropy += prob * subDataSetEntropy # 按特征分类后的熵
infoGain = baseEntropy - newEntropy # 原始熵与按特征分类后的熵的差值
if (infoGain > bestInfoGain): # 若按某特征划分后, 熵值减少的最大, 则次特征为最优分类特征
bestInfoGain = infoGain
bestFeature = i
return bestFeature # 返回信息增益最大的属性下标
# 对于majorityCnt 函数, 表示已经到决策树的叶子结点了, 没有办法再细分了。 这时就按照哪个决策多, 就采用哪个决策。
def majorityCnt(classList):
    """Majority vote for a leaf that cannot be split further: return the
    decision that occurs most often (ties go to the earliest-seen class,
    matching dict insertion order).
    """
    tally = {}
    for decision in classList:
        tally[decision] = tally.get(decision, 0) + 1
    return max(tally, key=tally.get)
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    The decision column (last field of each row) survives every recursive
    split, since splitDataSet only removes feature columns.  Note that
    `labels` is mutated (the chosen attribute's name is deleted), as in
    the classic ID3 recipe.
    """
    decisions = [row[-1] for row in dataSet]
    # Pure node: all remaining samples share one decision — return it.
    if decisions.count(decisions[0]) == len(decisions):
        return decisions[0]
    # No feature columns left, only the decision column: majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(decisions)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    featName = labels[bestFeat]
    tree = {featName: {}}  # the tree is stored as {feature: {value: subtree}}
    del labels[bestFeat]
    # Grow one branch per distinct value of the chosen attribute.
    for value in {row[bestFeat] for row in dataSet}:
        branchLabels = labels[:]  # each branch works on its own copy
        subset = splitDataSet(dataSet, bestFeat, value)
        tree[featName][value] = createTree(subset, branchLabels)
    return tree
if __name__ == '__main__':
    # Build the sample data and print the learned decision-tree model.
    data, featureNames = createDataSet2()
    print(createTree(data, featureNames))
3、实验结果
运行结果:
{'Credit_History': {'Unknown': {'Income': {'0_5K': {'Debt': {'High': 'Reject', 'Low': 'Approve'}}, '5_10K': 'Approve'}}, 'Bad': 'Reject', 'Good': 'Approve'}}
决策树:
实验二 逻辑回归
1、数据集
EntryNo. |Square_Feet| Price
---|---|---
1 |150| 6450
2 |200 |7450
3 |250 |8450
4 |300 |9450
5 |350 |11450
6 |400 |15450
7 |600| 18450
2、代码
import numpy
from pandas import read_csv
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
# Load the data and inspect the money/sales relationship.
data = read_csv('mydata.csv')
# Scatter plot of predictor vs. response, plus the pairwise correlation
# matrix (money and sales should correlate strongly).
plt.scatter(data.money, data.sales)
print(data.corr())
# Build a simple linear-regression model:
# (1) instantiate LinearRegression,
# (2) select the independent (x) and dependent (y) variables,
# (3) fit — this solves for the intercept and slope.
lrModel = LinearRegression()
x = data[['money']]
y = data[['sales']]
lrModel.fit(x, y)
# R^2 goodness-of-fit score on the training data.
print("模型得分:", lrModel.score(x, y))
# Forecast two new inputs.  BUG FIX: the original discarded the return
# value of predict(); print it so the forecast is actually shown (the
# report's results section includes these two values).
print(lrModel.predict([[60], [70]]))
# Intercept (alpha) and slope (beta) of the fitted line.
alpha = lrModel.intercept_[0]
print("截距为:", alpha)
beta = lrModel.coef_[0][0]
print("参数为:", beta)
# Recompute the forecasts manually from alpha and beta as a sanity check.
result = alpha + beta * numpy.array([60, 70])
print("结果为:", result)
3、结果
no money sales
no 1.000000 -0.297891 -0.393672
money -0.297891 1.000000 0.941814
sales -0.393672 0.941814 1.000000
模型得分: 0.8870135716304371
[[150.0667131]
[173.7963006]]
截距为: 7.689188143332572
参数为: 2.3729587493228075
结果为: [150.0667131 173.7963006]
实验三 神经网络
1、代码
'''
神经网络
'''
# -*- coding: UTF-8 -*-
import random
import math
class bp:
    """Minimal fully-connected feed-forward network with one hidden layer,
    trained by plain back-propagation with momentum.

    Public attributes used by caller code: sumse (summed absolute output
    error of the last train() call), wih/who (weight matrices), seh/seo
    (bias vectors), i_size/h_size/o_size (layer widths).
    """

    def init_w(self, w, x, y):
        # Fill an x-by-y weight matrix with random values; values below
        # 0.5 are negated, spreading weights over (-0.5, 0) and [0.5, 1).
        for i in range(x):
            for j in range(y):
                w[i][j] = random.random()
                if w[i][j] < 0.5:
                    w[i][j] = -w[i][j]

    def init_se(self, w, x):
        # Same randomisation scheme for a bias vector of length x.
        for i in range(x):
            w[i] = random.random()
            if w[i] < 0.5:
                w[i] = -w[i]

    def forward(self, inp, outp, w, x, y, se):
        # Forward-propagate layer inp (width x) into outp (width y):
        # weighted sum plus bias, squashed by the logistic sigmoid.
        for j in range(y):
            outp[j] = 0
            for i in range(x):
                outp[j] += inp[i] * w[i][j]
            outp[j] = outp[j] + se[j]
            outp[j] = 1.0 / (1 + math.exp(-outp[j]))

    def reforward(self):
        # Back-propagate the error; sumse accumulates |delta| over the
        # output layer and is used by callers as a stopping criterion.
        self.sumse = 0
        # Output-layer deltas: sigmoid derivative times (target - actual).
        for i in range(self.o_size):
            self.eo[i] = self.ouput[i] * (1.0 - self.ouput[i]) * (self.ouputex[i] - self.ouput[i])
            self.sumse += abs(self.eo[i])
        # Hidden-layer deltas, back-propagated through who.
        for i in range(self.h_size):
            self.eh[i] = 0
            for j in range(self.o_size):
                self.eh[i] += self.hidden[i] * (1 - self.hidden[i]) * self.who[i][j] * self.eo[j]

    def updatew(self):
        # Momentum gradient step for the hidden->output weights ...
        for i in range(self.h_size):
            for j in range(self.o_size):
                self.upwho[i][j] = (self.L * self.hidden[i] * self.eo[j]) + (self.Mom * self.upwho[i][j])
                self.who[i][j] += self.upwho[i][j]
        # ... and for the input->hidden weights.
        for i in range(self.i_size):
            for j in range(self.h_size):
                self.upwih[i][j] = (self.L * self.input[i] * self.eh[j]) + (self.Mom * self.upwih[i][j])
                self.wih[i][j] += self.upwih[i][j]

    def updatefa(self):
        # Update the biases along the layer deltas.
        # BUG FIX: the original iterated the hidden biases over i_size;
        # seh has h_size entries (it only worked because i_size == h_size
        # in the demo configuration).
        for i in range(self.h_size):
            self.seh[i] += self.L * self.eh[i]
        for i in range(self.o_size):
            self.seo[i] += self.L * self.eo[i]

    def train(self, in1, out1):
        # One stochastic-gradient step on a single (input, target) pair.
        self.input = in1
        self.ouputex = out1
        self.forward(self.input, self.hidden, self.wih, self.i_size, self.h_size, self.seh)
        self.forward(self.hidden, self.ouput, self.who, self.h_size, self.o_size, self.seo)
        self.reforward()   # back-propagate the error
        self.updatew()     # update the weights
        self.updatefa()    # update the biases

    def test(self, init1):
        # Forward pass only; print each output activation.
        self.input = init1
        self.forward(self.input, self.hidden, self.wih, self.i_size, self.h_size, self.seh)
        self.forward(self.hidden, self.ouput, self.who, self.h_size, self.o_size, self.seo)
        for i in range(self.o_size):
            print(self.ouput[i])

    def get_e(self, w, x):
        # Print the x entries of bias vector w, two per line.
        # BUG FIX: the original looped over o_size regardless of x, so the
        # hidden-layer biases were only partially printed (the report's
        # results show a single hidden threshold for a 2-unit layer).
        self.f = 0
        for i in range(x):
            print(w[i], end="")
            self.f += 1
            if self.f % 2 == 0:
                print("")
        print("")

    def get_w(self, w, x, y):
        # Print an x-by-y weight matrix, two entries per line.
        self.f = 0
        for i in range(x):
            for j in range(y):
                print(w[i][j], end="")
                print(" ", end="")
                self.f += 1
                if self.f % 2 == 0:
                    print("")

    def __init__(self, size, l, mom):
        """size = [inputs, hidden, outputs]; l = learning rate; mom = momentum."""
        self.L = l
        self.Mom = mom
        self.i_size = size[0]
        self.h_size = size[1]
        self.o_size = size[2]
        # Weight matrices and their momentum accumulators.
        self.wih = [[0 for i in range(self.h_size)] for j in range(self.i_size)]
        self.who = [[0 for i in range(self.o_size)] for j in range(self.h_size)]
        self.upwih = [[0 for i in range(self.h_size)] for j in range(self.i_size)]
        self.upwho = [[0 for i in range(self.o_size)] for j in range(self.h_size)]
        # Layer activations, expected output, biases, and error deltas.
        self.input = [0 for i in range(self.i_size)]
        self.hidden = [0 for i in range(self.h_size)]
        self.ouput = [0 for i in range(self.o_size)]
        self.ouputex = [0 for i in range(self.o_size)]
        self.seh = [0 for i in range(self.h_size)]
        self.seo = [0 for i in range(self.o_size)]
        self.eh = [0 for i in range(self.h_size)]
        self.eo = [0 for i in range(self.o_size)]
        # Randomise all weights and biases.
        self.init_w(self.wih, self.i_size, self.h_size)
        self.init_w(self.who, self.h_size, self.o_size)
        self.init_se(self.seh, self.h_size)
        self.init_se(self.seo, self.o_size)
# Demo: train the network on an XOR-like truth table (with two extra
# noisy copies of [0.1, 1.0] -> 1) and inspect the learned parameters.
size = [2, 2, 1]
inputData = [[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.1, 1.0], [0.1, 1.0]]
outputData = [[0.0], [1.0], [1.0], [0.0], [1.0], [1.0]]
testData = [[0.05, 0.1], [0.2, 0.9], [0.86, 0.95]]

t = bp(size, 0.5, 0.9)
# Up to 10000 epochs; stop early once the summed output error of the
# last training step drops below 1e-3.
for i in range(10000):
    for j in range(6):
        t.train(inputData[j], outputData[j])
    if t.sumse < 0.001:
        print("迭代次数:", i)
        break

print("误差为:", t.sumse)
print("输入层与隐含层连接权值为:", end="")
t.get_w(t.wih, t.i_size, t.h_size)
print("隐含层与输出层连接权值为:", end="")
t.get_w(t.who, t.h_size, t.o_size)
print("隐含层神经元阈值为:", end="")
t.get_e(t.seh, t.h_size)
print("输出层神经元阈值为:", end="")
t.get_e(t.seo, t.o_size)
# Forward-propagate each held-out sample and print the network output.
for i in range(3):
    print("训练样本为:", testData[i], " 结果为:", end="")
    t.test(testData[i])
2、实验结果
迭代次数: 161
误差为: 0.000995014678158063
输入层与隐含层连接权值为:6.7051554509091 6.926310016229366
3.8318324717255 -2.6904779540702215
隐含层与输出层连接权值为:5.525762771016369 -5.75168530920581
隐含层神经元阈值为:-0.7568688215726267
输出层神经元阈值为:0.17134156840340653
训练样本为: [0.05, 0.1] 结果为:0.13175326728624578
训练样本为: [0.2, 0.9] 结果为:0.8879985499542564
训练样本为: [0.86, 0.95] 结果为:0.496739493293363
实验四 卷积神经网络
1、代码
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import cv2
from torch.autograd import Variable
# Device configuration
# Run on the first GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters.
num_epochs = 5
output_size = 10      # ten digit classes
batch_size = 100
learning_rate = 0.001

# MNIST ships with torchvision; missing files are downloaded into ./data.
# ToTensor() converts each PIL image to a tensor as it is loaded.
train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
test_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

# Batched loaders: shuffle the training data each epoch, keep the test
# data in a fixed order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=batch_size, shuffle=False)
#2个卷积层的神经网络
class ConvNet(nn.Module):
    """Two conv blocks (conv -> batch-norm -> ReLU -> 2x2 max-pool)
    followed by a single linear classifier over `output_size` classes.

    Each 5x5 convolution uses padding 2, so only the pools shrink the
    spatial size: 28x28 input -> 14x14 -> 7x7 before the fc layer.
    """

    def __init__(self):
        super(ConvNet, self).__init__()
        # Block 1: 1 input channel -> 16 feature maps.
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        # Block 2: 16 -> 32 feature maps.
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        # Classifier over the flattened 7*7*32 feature volume.
        self.fc = nn.Linear(7 * 7 * 32, output_size)

    def forward(self, x):
        features = self.layer1(x)
        features = self.layer2(features)
        # Flatten everything after the batch dimension for the fc layer.
        flat = features.reshape(features.size(0), -1)
        return self.fc(flat)
model = ConvNet().to(device)

# Cross-entropy over the ten digit classes, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Standard training loop: forward pass, loss, backward pass, step.
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Progress report every 100 batches.
        if (i + 1) % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
# Evaluate on the held-out set: switch to eval mode (affects BatchNorm)
# and disable autograd while scoring.
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # Predicted class = arg-max over the class logits.
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

# Persist the trained weights.
torch.save(model.state_dict(), 'model.pkl')
# Predict one batch of test images and display them.
X_test, y_test = next(iter(test_loader))
inputs = Variable(X_test)  # NOTE(review): Variable is a deprecated no-op wrapper
inputs = inputs.to(device)
pred = model(inputs)
_, pred = torch.max(pred, 1)
# BUG FIX: the original passed a generator expression to print(), which
# printed "<generator object ...>" (see the report's own result log)
# instead of the labels; materialise it as a list like the real labels.
print("Predict Label is:", [i for i in pred])
print("Real Label is :", [i for i in y_test])
# Tile the batch into one grid image and move channels last for OpenCV.
img = torchvision.utils.make_grid(X_test)
img = img.numpy().transpose(1, 2, 0)
# NOTE(review): no Normalize transform was applied at load time, so this
# scale/offset just brightens the image rather than undoing normalisation.
std = [0.5, 0.5, 0.5]
mean = [0.5, 0.5, 0.5]
img = img * std + mean
cv2.imshow('win', img)
key_pressed = cv2.waitKey(0)
2、实验结果
Test Accuracy of the model on the 10000 test images: 99.09 %
Predict Label is: <generator object <genexpr> at 0x000001CD47A4C8C8>
Real Label is : [tensor(7), tensor(2), tensor(1), tensor(0), tensor(4), tensor(1), tensor(4), tensor(9), tensor(5), tensor(9), tensor(0), tensor(6), tensor(9), tensor(0), tensor(1), tensor(5), tensor(9), tensor(7), tensor(3), tensor(4), tensor(9), tensor(6), tensor(6), tensor(5), tensor(4), tensor(0), tensor(7), tensor(4), tensor(0), tensor(1), tensor(3), tensor(1), tensor(3), tensor(4), tensor(7), tensor(2), tensor(7), tensor(1), tensor(2), tensor(1), tensor(1), tensor(7), tensor(4), tensor(2), tensor(3), tensor(5), tensor(1), tensor(2), tensor(4), tensor(4), tensor(6), tensor(3), tensor(5), tensor(5), tensor(6), tensor(0), tensor(4), tensor(1), tensor(9), tensor(5), tensor(7), tensor(8), tensor(9), tensor(3), tensor(7), tensor(4), tensor(6), tensor(4), tensor(3), tensor(0), tensor(7), tensor(0), tensor(2), tensor(9), tensor(1), tensor(7), tensor(3), tensor(2), tensor(9), tensor(7), tensor(7), tensor(6), tensor(2), tensor(7), tensor(8), tensor(4), tensor(7), tensor(3), tensor(6), tensor(1), tensor(3), tensor(6), tensor(9), tensor(3), tensor(1), tensor(4), tensor(1), tensor(7), tensor(6), tensor(9)]