import numpy as np
引用B站:https://www.bilibili.com/video/BV1Uh411m74w?from=search&seid=4466513966431555719
学自:【B站博主:萌弟AI】
1、简单理解KNN
# Toy training set: four 2-D points; the first two carry label 0 and the
# last two carry label 1.
x = np.array([
    [1.0, 1.1],
    [1.0, 1.0],
    [0.0, 0.0],
    [0.0, 0.1],
])
y = np.array([0, 0, 1, 1])
x, y
(array([[1. , 1.1],
[1. , 1. ],
[0. , 0. ],
[0. , 0.1]]), array([0, 0, 1, 1]))
_x = np.array([1. , 1.1])
# Print every training sample followed by a separator line.
for i in x:
    print(i)
    print("---------")
[1. 1.1]
---------
[1. 1.]
---------
[0. 0.]
---------
[0. 0.1]
---------
temp = _x - x
temp
array([[0. , 0. ],
[0. , 0.1],
[1. , 1.1],
[1. , 1. ]])
temp = np.power(temp,2)
temp
array([[0. , 0. ],
[0. , 0.01],
[1. , 1.21],
[1. , 1. ]])
temp = temp.sum(axis=1)
temp
array([0. , 0.01, 2.21, 2. ])
temp = np.sqrt(temp)
temp
array([0. , 0.1 , 1.48660687, 1.41421356])
argsort = temp.argsort()
argsort
array([0, 1, 3, 2], dtype=int64)
y
array([0, 0, 1, 1])
result = y[argsort][:3]
result
array([0, 0, 1])
np.bincount(result).argmax()
0
def knn(_x, k=3, train_x=None, train_y=None):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        _x: Feature vector of the sample to classify.
        k: Number of nearest training samples that vote. Must be >= 1.
        train_x: Training feature matrix, one row per sample. Defaults to
            the module-level ``x``.
        train_y: Non-negative integer labels aligned with ``train_x``
            (required by ``np.bincount``). Defaults to the module-level
            ``y``.

    Returns:
        The most frequent label among the k nearest training samples.
        When several labels share the highest count, the smallest one is
        returned.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    # Fall back to the global training set when none is passed in.
    if train_x is None:
        train_x = x
    if train_y is None:
        train_y = y
    train_x = np.asarray(train_x)
    train_y = np.asarray(train_y)
    # Euclidean distance from _x to every training row.
    distances = np.sqrt(np.power(_x - train_x, 2).sum(axis=1))
    # A stable sort keeps equally distant samples in their original order,
    # so the selected neighbours are deterministic.
    nearest = distances.argsort(kind="stable")[:k]
    return np.bincount(train_y[nearest]).argmax()
# Classify each training sample against the training set itself.
for i in x:
    print(knn(i))
0
0
1
1
2、KNN简单实战用法
# Load the data set.
def load_data(path='约会数据.txt'):
    """Read a tab-separated file of three features plus an integer label.

    Each non-blank line must hold at least four tab-separated fields: the
    first three are parsed as floats (features) and the fourth as an int
    (label). Blank lines are skipped.

    Args:
        path: File to read. Defaults to the file name used by this
            notebook.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a float array of shape
        ``(n_rows, 3)`` and ``y`` is an int array of shape ``(n_rows,)``.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
        ValueError: If a field cannot be parsed as a number.
    """
    with open(path) as fr:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        rows = [line.strip().split('\t') for line in fr if line.strip()]
    x = np.empty((len(rows), 3), dtype=float)
    y = np.empty(len(rows), dtype=int)
    for i, fields in enumerate(rows):
        x[i] = [float(value) for value in fields[:3]]
        y[i] = int(fields[3])
    return x, y
x, y = load_data()
x, y
(array([[4.0920000e+04, 8.3269760e+00, 9.5395200e-01],
[1.4488000e+04, 7.1534690e+00, 1.6739040e+00],
[2.6052000e+04, 1.4418710e+00, 8.0512400e-01],
...,
[2.6575000e+04, 1.0650102e+01, 8.6662700e-01],
[4.8111000e+04, 9.1345280e+00, 7.2804500e-01],
[4.3757000e+04, 7.8826010e+00, 1.3324460e+00]]),
array([3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3, 2, 1,
2, 3, 2, 3, 2, 3, 2, 1, 3, 1, 3, 1, 2, 1, 1, 2, 3, 3, 1, 2, 3, 3,
3, 1, 1, 1, 1, 2, 2, 1, 3, 2, 2, 2, 2, 3, 1, 2, 1, 2, 2, 2, 2, 2,
3, 2, 3, 1, 2, 3, 2, 2, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 1, 2, 2,
1, 1, 3, 3, 1, 2, 1, 3, 3, 2, 1, 1, 3, 1, 2, 3, 3, 2, 3, 3, 1, 2,
3, 2, 1, 3, 1, 2, 1, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 3, 3, 1, 3, 2,
2, 3, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 2, 3, 3, 1, 2, 3, 2, 2, 3, 3,
3, 1, 2, 2, 1, 1, 3, 2, 3, 3, 1, 2, 1, 3, 1, 2, 3, 2, 3, 1, 1, 1,
3, 2, 3, 1, 3, 2, 1, 3, 2, 2, 3, 2, 3, 2, 1, 1, 3, 1, 3, 2, 2, 2,
3, 2, 2, 1, 2, 2, 3, 1, 3, 3, 2, 1, 1, 1, 2, 1, 3, 3, 3, 3, 2, 1,
1, 1, 2, 3, 2, 1, 3, 1, 3, 2, 2, 3, 1, 3, 1, 1, 2, 1, 2, 2, 1, 3,
1, 3, 2, 3, 1, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 2, 1, 1, 1, 3,
3, 2, 1, 1, 1, 2, 2, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 3, 2,
3, 3, 3, 3, 1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 2, 1, 3, 1, 3, 2, 2, 1,
2, 2, 3, 1, 3, 2, 1, 1, 3, 3, 2, 3, 3, 2, 3, 1, 3, 1, 3, 3, 1, 3,
2, 1, 3, 1, 3, 2, 1, 2, 2, 1, 3, 1, 1, 3, 3, 2, 2, 3, 1, 2, 3, 3,
2, 2, 1, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 1, 3, 3, 3, 2, 3, 2, 1, 1,
1, 1, 1, 3, 2, 2, 1, 2, 1, 3, 2, 1, 3, 2, 1, 3, 1, 1, 3, 3, 3, 3,
2, 1, 1, 2, 1, 3, 3, 2, 1, 2, 3, 2, 1, 2, 2, 2, 1, 1, 3, 1, 1, 2,
3, 1, 1, 2, 3, 1, 3, 1, 1, 2, 2, 1, 2, 2, 2, 3, 1, 1, 1, 3, 1, 3,
1, 3, 3, 1, 1, 1, 3, 2, 3, 3, 2, 2, 1, 1, 1, 2, 1, 2, 2, 3, 3, 3,
1, 1, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 1, 1, 1,
3, 3, 3, 3, 2, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 2, 3, 2, 1, 2, 2, 2,
3, 2, 1, 3, 2, 3, 2, 3, 2, 1, 1, 2, 3, 1, 3, 3, 3, 1, 2, 1, 2, 2,
1, 2, 2, 2, 2, 2, 3, 2, 1, 3, 3, 2, 2, 2, 3, 1, 2, 1, 1, 3, 2, 3,
2, 3, 2, 3, 3, 2, 2, 1, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 3, 2,
2, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 1, 3, 3, 1, 2, 3, 1, 3, 1, 1, 2,
1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 3, 1, 3, 1, 1, 2, 2, 2, 3, 2,
2, 1, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 3, 2, 3, 2, 1, 2, 1, 1, 1,
2, 3, 2, 2, 1, 2, 2, 1, 3, 1, 3, 3, 3, 2, 2, 3, 3, 1, 2, 2, 2, 3,
1, 2, 1, 3, 1, 2, 3, 1, 1, 1, 2, 2, 3, 1, 3, 1, 1, 3, 1, 2, 3, 1,
2, 3, 1, 2, 3, 2, 2, 2, 3, 1, 3, 1, 2, 3, 2, 2, 3, 1, 2, 3, 2, 3,
1, 2, 2, 3, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 3, 2, 1, 3, 3, 3,
1, 1, 3, 1, 2, 3, 3, 2, 2, 2, 1, 2, 3, 2, 2, 3, 2, 2, 2, 3, 3, 2,
1, 3, 2, 1, 3, 3, 1, 2, 3, 2, 1, 3, 3, 3, 1, 2, 2, 2, 3, 2, 3, 3,
1, 2, 1, 1, 2, 1, 3, 1, 2, 2, 1, 3, 2, 1, 3, 3, 2, 2, 2, 1, 2, 2,
1, 3, 1, 3, 1, 3, 3, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 3, 2, 2, 1,
3, 1, 2, 3, 1, 3, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 3, 3, 3, 1, 3, 2,
2, 1, 1, 3, 3, 2, 2, 2, 1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 3, 1, 2, 2,
2, 3, 2, 1, 2, 1, 2, 3, 3, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2, 2, 2,
1, 3, 3, 3, 3, 3, 1, 1, 3, 2, 1, 2, 1, 2, 2, 3, 2, 2, 2, 3, 1, 2,
1, 2, 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 2,
3, 2, 3, 3, 2, 2, 1, 1, 1, 3, 3, 1, 1, 1, 3, 3, 2, 1, 2, 1, 1, 2,
2, 1, 1, 1, 3, 1, 1, 2, 3, 2, 2, 1, 3, 1, 2, 3, 1, 2, 2, 2, 2, 3,
2, 3, 3, 1, 2, 1, 2, 3, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 2, 2, 2, 2, 2, 1, 3, 3, 3]))
x.min(axis=0)
array([0. , 0. , 0.001156])
x.max(axis=0)
array([9.1273000e+04, 2.0919349e+01, 1.6955170e+00])
(x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0))
array([[0.44832535, 0.39805139, 0.56233353],
[0.15873259, 0.34195467, 0.98724416],
[0.28542943, 0.06892523, 0.47449629],
...,
[0.29115949, 0.50910294, 0.51079493],
[0.52711097, 0.43665451, 0.4290048 ],
[0.47940793, 0.3768091 , 0.78571804]])
# Min-max normalisation.
def norm(data=None):
    """Rescale every column linearly so its minimum is 0 and maximum is 1.

    Args:
        data: 2-D array-like with one sample per row. Defaults to the
            module-level ``x``.

    Returns:
        A new float array with the same shape as the input; the input is
        not modified. A column whose values are all equal maps to 0.
    """
    if data is None:
        data = x
    # Work in float so integer input can be divided.
    data = np.asarray(data, dtype=float)
    # Statistics are taken per column.
    col_min = data.min(axis=0)
    col_range = data.max(axis=0) - col_min
    # A constant column has zero range; divide by 1 instead so it becomes
    # 0 rather than NaN.
    col_range = np.where(col_range == 0, 1.0, col_range)
    # After the subtraction the minimum is 0; after the division the
    # maximum is 1.
    return (data - col_min) / col_range
x = norm()
x
array([[0.44832535, 0.39805139, 0.56233353],
[0.15873259, 0.34195467, 0.98724416],
[0.28542943, 0.06892523, 0.47449629],
...,
[0.29115949, 0.50910294, 0.51079493],
[0.52711097, 0.43665451, 0.4290048 ],
[0.47940793, 0.3768091 , 0.78571804]])
# Split the data set: rows from index 900 onward become the test set and
# the first 900 rows stay as the training set.
test_x, test_y = x[900:], y[900:]
x, y = x[:900], y[:900]
test_x.shape, test_y.shape, x.shape, y.shape
((100, 3), (100,), (900, 3), (900,))
def knn(_x, k=3, train_x=None, train_y=None):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        _x: Feature vector of the sample to classify.
        k: Number of nearest training samples that vote. Must be >= 1.
        train_x: Training feature matrix, one row per sample. Defaults to
            the module-level ``x``.
        train_y: Non-negative integer labels aligned with ``train_x``
            (required by ``np.bincount``). Defaults to the module-level
            ``y``.

    Returns:
        The most frequent label among the k nearest training samples.
        When several labels share the highest count, the smallest one is
        returned.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    # Fall back to the global training set when none is passed in.
    if train_x is None:
        train_x = x
    if train_y is None:
        train_y = y
    train_x = np.asarray(train_x)
    train_y = np.asarray(train_y)
    # Euclidean distance from _x to every training row.
    distances = np.sqrt(np.power(_x - train_x, 2).sum(axis=1))
    # A stable sort keeps equally distant samples in their original order,
    # so the selected neighbours are deterministic.
    nearest = distances.argsort(kind="stable")[:k]
    return np.bincount(train_y[nearest]).argmax()
knn(test_x[4], k=11)
2
test_x[0]
array([0.51376639, 0.17031964, 0.26218144])
test_y
array([1, 2, 1, 2, 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 1, 3,
3, 2, 3, 2, 3, 3, 2, 2, 1, 1, 1, 3, 3, 1, 1, 1, 3, 3, 2, 1, 2, 1,
1, 2, 2, 1, 1, 1, 3, 1, 1, 2, 3, 2, 2, 1, 3, 1, 2, 3, 1, 2, 2, 2,
2, 3, 2, 3, 3, 1, 2, 1, 2, 3, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 3, 2, 2, 2, 2, 2, 1, 3, 3, 3])
# Classify every held-out sample with k=5, print prediction and true label
# side by side, then print the fraction classified correctly.
correct = 0
for sample, label in zip(test_x, test_y):
    pred = knn(sample, k=5)
    print(pred, label)
    if pred == label:
        correct += 1
print(correct / len(test_x))
1 1
2 2
1 1
2 2
2 2
1 1
1 1
2 2
3 3
1 3
1 1
1 1
1 1
1 1
3 3
3 3
3 3
3 3
2 3
3 3
1 1
3 3
3 3
2 2
3 3
2 2
3 3
3 3
2 2
2 2
1 1
2 1
1 1
3 3
3 3
1 1
1 1
1 1
1 3
3 3
2 2
1 1
2 2
1 1
1 1
2 2
2 2
1 1
1 1
1 1
2 3
1 1
1 1
2 2
3 3
2 2
2 2
1 1
3 3
1 1
2 2
3 3
1 1
2 2
2 2
2 2
2 2
3 3
2 2
3 3
3 3
1 1
2 2
1 1
2 2
3 3
1 1
3 3
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
3 3
2 2
2 2
2 2
2 2
2 2
1 1
3 3
3 3
3 3
0.95
3、极大似然估计
贝叶斯原理取用:https://blog.csdn.net/zengxiantao1994/article/details/72787849
import numpy as np
# Read the raw lines of the data file.
with open('简单分类数据.txt') as fr:
    lines = fr.readlines()
import numpy as np
# Load the data set.
def load_data(path='简单分类数据.txt'):
    """Read a tab-separated file of two features plus a numeric label.

    Each non-blank line must hold at least three tab-separated fields: the
    first two are the features and the third is the label, all parsed as
    floats. Blank lines are skipped.

    Args:
        path: File to read. Defaults to the file name used by this
            notebook.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a float array of shape
        ``(n_rows, 2)`` and ``y`` is a float array of shape ``(n_rows,)``.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If a field cannot be parsed as a float.
    """
    with open(path) as fr:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        rows = [line.strip().split('\t') for line in fr if line.strip()]
    x = np.empty((len(rows), 2), dtype=float)
    y = np.empty(len(rows), dtype=float)
    for i, fields in enumerate(rows):
        x[i] = [float(value) for value in fields[:2]]
        y[i] = float(fields[2])
    return x, y
x, y = load_data()
x[:5], y[:5]
(array([[-0.017612, 14.053064],
[-1.395634, 4.662541],
[-0.752157, 6.53862 ],
[-1.322371, 7.152853],
[ 0.423363, 11.054677]]), array([-1., 1., -1., -1., -1.]))
N, M = x.shape
x.shape
(100, 2)
N
100
N, M = x.shape
w = np.ones(2)
b = 0.0
w
array([1., 1.])
x[:, 0].min()
-3.642001
x[:, 0].max()
3.01015
np.arange(x[:, 0].min(), x[:, 0].max(), 1)
array([-3.642001, -2.642001, -1.642001, -0.642001, 0.357999, 1.357999,
2.357999])
w[1]
1.0
X = np.arange(x[:, 0].min(), x[:, 0].max(), 1)
# np.empty does not initialise Y: entries that have not been assigned yet
# hold arbitrary values when printed below.
Y = np.empty([len(X)])
for i, x0 in enumerate(X):
    # Solve w[0] * x0 + w[1] * x1 + b = 0 for x1.
    Y[i] = (-w[0] * x0 - b) / w[1]
    # Print after every assignment to watch Y fill in one entry at a time.
    print(Y)
[ 3.642001 -2.642001 -1.642001 -0.642001 0.357999 1.357999 2.357999]
[ 3.642001 2.642001 -1.642001 -0.642001 0.357999 1.357999 2.357999]
[ 3.642001 2.642001 1.642001 -0.642001 0.357999 1.357999 2.357999]
[3.642001 2.642001 1.642001 0.642001 0.357999 1.357999 2.357999]
[ 3.642001 2.642001 1.642001 0.642001 -0.357999 1.357999 2.357999]
[ 3.642001 2.642001 1.642001 0.642001 -0.357999 -1.357999 2.357999]
[ 3.642001 2.642001 1.642001 0.642001 -0.357999 -1.357999 -2.357999]
import matplotlib.pyplot as plt
# Draw the line w.x + b = 0 on top of the data.
def draw():
    """Scatter-plot the samples and draw the line ``w.x + b = 0``.

    Reads the module-level ``x``, ``y``, ``w`` and ``b``. Samples are
    coloured by their label. Assumes ``w[1]`` is non-zero, since the line
    is solved for the second coordinate.
    """
    # Two endpoints define a straight line; using the exact min and max of
    # the first feature makes the line span the whole data range.
    line_x = np.array([x[:, 0].min(), x[:, 0].max()])
    # Solve w[0] * x0 + w[1] * x1 + b = 0 for x1.
    line_y = (-w[0] * line_x - b) / w[1]
    plt.scatter(x[:, 0], x[:, 1], c=y)
    plt.plot(line_x, line_y)
    plt.show()
draw()
<Figure size 640x480 with 1 Axes>
draw()
def predict(x):
    """Return the sigmoid of the linear score ``w.dot(x) + b``.

    Reads the module-level weights ``w`` and bias ``b``.

    Args:
        x: Feature vector of one sample (same length as ``w``).

    Returns:
        A value in [0, 1]; get_loss treats it as the probability of
        label 1.
    """
    z = w.dot(x) + b
    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))). logaddexp evaluates the
    # log term without overflowing when z is very negative.
    return np.exp(-np.logaddexp(0.0, -z))
predict(x[0])
0.999999197434851
def get_loss():
    """Return the log-likelihood of the labels under the current w and b.

    Reads the module-level ``x``, ``y``, ``w`` and ``b``. With
    ``p = sigmoid(w.x + b)``, a sample labelled 1 contributes ``log(p)``
    (we want p as large as possible) and every other sample (labelled -1
    in this data set) contributes ``log(1 - p)`` (we want 1 - p as large
    as possible). This equals the textbook form
    ``d * log(p) + (1 - d) * log(1 - p)`` with ``d = 1`` for label 1 and
    ``d = 0`` otherwise.

    Returns:
        The summed log-likelihood as a float (<= 0; closer to 0 is a
        better fit).
    """
    z = x.dot(w) + b
    # log(sigmoid(z)) = -logaddexp(0, -z) and
    # log(1 - sigmoid(z)) = -logaddexp(0, z). Evaluating the logs this way
    # stays finite even when the sigmoid saturates to exactly 0 or 1.
    log_p = -np.logaddexp(0.0, -z)
    log_not_p = -np.logaddexp(0.0, z)
    # Select one term per sample instead of multiplying by 0/1, so an
    # unused term can never turn the sum into NaN.
    return float(np.where(y == 1, log_p, log_not_p).sum())
get_loss()
-540.4081260783354
# Brute-force (numerical) gradient.
def gradient():
    """Estimate the gradient of get_loss() by forward finite differences.

    Each parameter (every entry of the module-level ``w``, then ``b``) is
    increased by a small step, the loss is re-evaluated, and the parameter
    is restored to its exact previous value.

    Returns:
        A tuple ``(grad_w, grad_b)``: ``grad_w`` is a float array with one
        entry per weight in ``w`` and ``grad_b`` is the derivative with
        respect to ``b``.
    """
    # w is only modified in place; b is rebound, so it must be global.
    global b
    upsilon = 1e-2
    base_loss = get_loss()

    grad_w = np.empty(len(w), dtype=float)
    for j in range(len(w)):
        saved = w[j]
        w[j] = saved + upsilon
        try:
            grad_w[j] = (get_loss() - base_loss) / upsilon
        finally:
            # Restore the saved value rather than subtracting the step, so
            # rounding cannot make the weight drift.
            w[j] = saved

    saved_b = b
    b = saved_b + upsilon
    try:
        grad_b = (get_loss() - base_loss) / upsilon
    finally:
        b = saved_b

    return grad_w, grad_b
gradient()
(array([ -19.34439272, -510.07213195]), -33.286852351091056)
# Training: get_loss() is a log-likelihood, so each step moves w and b
# along the gradient to increase it.
for i in range(100):
    g_w, g_b = gradient()
    w += g_w * 1e-2
    b += g_b * 1e-1
    # Report the log-likelihood every 10 iterations.
    if i % 10 == 0:
        print(get_loss())
-849.7506395767857
-9.85351358926892
-103.73703332631135
-28.9351102653891
-12.89001666735565
-9.35218572999099
-9.32529120124075
-9.323731446383107
-9.324079875559004
-9.325080903063883
draw()
4、方差与偏差
5、岭回归和lasso回归
压缩估计(正则化):
除了刚刚讨论的直接对特征自身进行选择以外,我们还可以对回归的系数进行约束或者加罚的技巧对p个特征的模型进行拟合,显著降低模型方差,这样也会提高模型的拟合效果。具体来说,就是将回归系数往零的方向压缩,这也就是为什么叫压缩估计的原因了。
- 岭回归(L2正则化的例子):
在线性回归中,我们的损失函数为
$$
J(w) = \sum\limits_{i=1}^{N}(y_i-w_0-\sum\limits_{j=1}^{p}w_jx_{ij})^2
$$
我们在线性回归的损失函数的基础上添加对系数的约束或者惩罚,即:
$$
J(w) = \sum\limits_{i=1}^{N}(y_i-w_0-\sum\limits_{j=1}^{p}w_jx_{ij})^2 + \lambda\sum\limits_{j=1}^{p}w_j^2,\;\;其中,\lambda \ge 0
$$
$$
\hat{w} = (X^TX + \lambda I)^{-1}X^TY
$$
调节参数 $\lambda$ 的大小是影响压缩估计的关键:$\lambda$ 越大,惩罚的力度越大,系数则越趋近于0;反之,$\lambda$ 越小,惩罚的力度越小,系数越接近普通线性回归的估计。因此,选择合适的 $\lambda$ 对模型精度来说十分重要。岭回归通过牺牲线性回归的无偏性降低方差,有可能使得模型整体的测试误差较小,提高模型的泛化能力。
- Lasso回归(L1正则化的例子):
岭回归的一个很显著的特点是:将模型的系数往零的方向压缩,但是岭回归的系数只能趋于0而无法等于0,换句话说,就是无法做特征选择。能否使用压缩估计的思想做到像特征最优子集选择那样提取出重要的特征呢?答案是肯定的!我们只需要对岭回归的优化函数做小小的调整就行了,我们使用系数向量的L1范数替换岭回归中的L2范数:
$$
J(w) = \sum\limits_{i=1}^{N}(y_i-w_0-\sum\limits_{j=1}^{p}w_jx_{ij})^2 + \lambda\sum\limits_{j=1}^{p}|w_j|,\;\;其中,\lambda \ge 0
$$
为什么Lasso能做到特征选择而岭回归却不能做到呢?(如图:左边为lasso,右边为岭回归)