机器学习初步使用
1 回归
1.1 线性回归 - 最小二乘法拟合
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Latex, display

print("-" * 100)
# Sample points; y grows roughly linearly with x.
# NOTE(review): 1.777 breaks the otherwise monotone trend — possibly a typo
# for 1.177, but it matches the recorded output below, so it is kept as-is.
x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([0.199, 0.389, 0.5, 0.783, 0.980, 1.777, 1.38, 1.575, 1.771])
# Design matrix [x, 1] so lstsq solves y ≈ P[0]*x + P[1] (slope, intercept).
X = np.vstack([x, np.ones(len(x))]).T
print(f'''
X.shape={X.shape}
X={X}
''')
print(f'''
y.shape={y.shape}
y={y}
''')
print("-" * 100)
P = np.linalg.lstsq(X, y, rcond=None)[0]  # ordinary least squares
print(f'P.shape={P.shape}')
plt.plot(x, y, 'o', label='original data')
plt.plot(x, P[0] * x + P[1], 'r', label='fitted line')
plt.legend()  # fix: labels were declared but never displayed
plt.show()
----------------------------------------------------------------------------------------------------
X.shape=(9, 2)
X=[[1. 1.]
[2. 1.]
[3. 1.]
[4. 1.]
[5. 1.]
[6. 1.]
[7. 1.]
[8. 1.]
[9. 1.]]
y.shape=(9,)
y=[0.199 0.389 0.5 0.783 0.98 1.777 1.38 1.575 1.771]
----------------------------------------------------------------------------------------------------
P.shape=(2,)
1.2 非线性拟合 - 指数转log,变线性
左图使用线性最小二乘法,右图使用非线性最小二乘法
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Latex, display
from scipy.optimize import curve_fit

print("-" * 100)
# Yearly series s(t); assume s ≈ b*exp(a*t), so ln(s) is linear in t
# and plain linear least squares applies after the log transform.
t = np.array([1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968])
s = np.array([29.72, 30.61, 31.51, 32.13, 32.34, 32.85, 33.56, 34.20, 34.83])
x = np.array(t)
y = np.log(np.array(s))  # ln(s): linearizes the exponential model
# Design matrix [x, 1] so lstsq solves y ≈ P[0]*x + P[1].
X = np.vstack([x, np.ones(len(x))]).T
print(f'''
X.shape={X.shape}
X={X}
''')
print(f'''
y.shape={y.shape}
y={y}
''')
print("-" * 100)
P = np.linalg.lstsq(X, y, rcond=None)[0]  # linear least squares on (t, ln s)
plt.subplot(1, 2, 1)
# NOTE(review): P[1] is ln(b), not b itself — the title prints the raw
# intercept; kept as-is for parity with the right-hand subplot.
plt.title('(a,b)=(%f,%f)' % (P[0], P[1]))
plt.plot(t, s, 'o', label='original data')
plt.plot(t, np.exp(P[1]) * np.exp(P[0] * t), 'r', label='fitted line')
plt.legend()  # fix: labels were declared but never displayed
def func(x, c0, c1):
    """Linear model c0 + c1*x, used as the target for curve_fit."""
    return c1 * x + c0
# Fit the same line y = c0 + c1*x directly with nonlinear least squares.
popt, pcov = curve_fit(func, x, y)
plt.subplot(1, 2, 2)
# popt = (intercept c0, slope c1); the title shows (slope, intercept).
plt.title('(a,b)=(%f,%f)' % (popt[1], popt[0]))
plt.plot(x, y, 'o', label='original data')
plt.plot(x, func(x, *popt), 'r', label='fitted line')
plt.legend()  # fix: labels were declared but never displayed
plt.show()
----------------------------------------------------------------------------------------------------
X.shape=(9, 2)
X=[[1.960e+03 1.000e+00]
[1.961e+03 1.000e+00]
[1.962e+03 1.000e+00]
[1.963e+03 1.000e+00]
[1.964e+03 1.000e+00]
[1.965e+03 1.000e+00]
[1.966e+03 1.000e+00]
[1.967e+03 1.000e+00]
[1.968e+03 1.000e+00]]
y.shape=(9,)
y=[3.39182022 3.42132675 3.45030496 3.46979017 3.47630485 3.49195174
3.51333488 3.53222564 3.55047908]
----------------------------------------------------------------------------------------------------
2 聚类
2.1 各种聚类算法
K-Means算法
层次聚类算法:AgglomerativeClustering
密度聚类算法:DBSCAN
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Latex, display
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN


# Display Chinese characters in matplotlib figures.
def conf_zh(font_name):
    """Register font_name as matplotlib's sans-serif font and keep the
    minus sign rendering correctly alongside a CJK font."""
    from pylab import mpl
    mpl.rcParams['font.sans-serif'] = [font_name]
    mpl.rcParams['axes.unicode_minus'] = False


conf_zh('SimHei')
x = []
# Parse (longitude, latitude) pairs from city.txt. A context manager is used
# so the file handle is closed (the original left it open).
with open('city.txt', encoding='utf-8') as f:
    for v in f:
        # Only lines starting with '城' carry city records; skip the rest.
        if not v.startswith('城'):
            continue
        parts = v.split(':')
        # Assumed layout: field 2 is "<lon> ..." and field 3 is "<lat>;..."
        # — TODO confirm against the actual city.txt format.
        x.append([float(parts[2].split(' ')[0]),
                  float(parts[3].split(';')[0])])
x_y = np.array(x)
print(f'''
x_y={x_y}
''')
n_clusters = 8  # K: how many clusters to form
# (marker shape, colour) to use for each cluster index
markers = [['^', 'b'], ['x', 'c'], ['o', 'g'], ['*', 'k'], ['+', 'm'], ['s', 'r'], ['d', 'y'], ['v', 'k']]
pltsub = [5, 3]  # subplot grid: 5 rows x 3 columns
f_index = 1
plt.subplot(pltsub[0], pltsub[1], f_index)
f_index += 1
plt.subplots_adjust(top=4, right=3, hspace=0.3, wspace=0.3)
plt.title('K-Means')
cls = KMeans(n_clusters).fit(x_y)
# quick demo of a boolean mask over the fitted labels
members = cls.labels_ == 3
print(f'members={members}')
for idx in range(n_clusters):
    members = cls.labels_ == idx
    # boolean indexing keeps only the rows belonging to cluster idx
    plt.scatter(x_y[members, 0], x_y[members, 1], s=60, marker=markers[idx][0], c=markers[idx][1], alpha=0.5)
plt.xlabel('经度')
plt.ylabel('纬度')
# Hierarchical clustering with each linkage strategy, one subplot apiece.
# The original repeated this block four times verbatim; a loop removes the
# duplication while keeping the exact same sequence of plotting calls.
# NOTE(review): .fit(x) receives the plain Python list while KMeans above
# used the x_y array — sklearn converts either, so results are unchanged.
agglomerative_variants = [
    ('ward', '层次聚类:AgglomerativeClustering - ward - 簇内方差最小化'),
    ('complete', '层次聚类:AgglomerativeClustering - complete - 簇间距离最大值最小化'),
    ('average', '层次聚类:AgglomerativeClustering - average - 簇间平均距离最小化'),
    ('single', '层次聚类:AgglomerativeClustering - single - 簇聚在一起'),
]
for linkage, title in agglomerative_variants:
    plt.subplot(pltsub[0], pltsub[1], f_index)
    f_index += 1
    plt.title(title)
    cls = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters).fit(x)
    for i in range(n_clusters):
        members = cls.labels_ == i
        # boolean indexing: draw only the points assigned to cluster i
        plt.scatter(x_y[members, 0], x_y[members, 1], s=60, marker=markers[i][0], c=markers[i][1], alpha=0.5)
    plt.xlabel('经度')
    plt.ylabel('纬度')
plt.subplot(pltsub[0],pltsub[1],f_index)
f_index += 1
plt.title('密度聚类:DBSCAN')
cls = DBSCAN(eps=2.5,min_samples=1).fit(x)
n_clusters_dbscan = len(set(cls.labels_))
print(f'clusters={n_clusters_dbscan}')
for i in range(n_clusters_dbscan):
members = cls.labels_ == i
# 利用布尔索引,只抓取True的出来展示
plt.scatter(x_y[members,0],x_y[members,1],s=60,marker=markers[i][0], c=markers[i][1], alpha=0.5)
plt.xlabel('经度')
plt.ylabel(</