一、标准分布的随机生成
1.1.1通过生成正态分布随机数画直方图
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0)
mean = 0
theta = 1
#1.numpy.random.normal(mu, theta, num)
series = np.random.normal(mean, theta,1000)
#2.plt.hist()将数组seires count成30个等宽的区间,纵轴显示每个区间的概率密度
#[param]bins: The edges of the bins
_, bins, _ = plt.hist(series, 30, density=True) #[param]rbins=30,it defines the 30 equal-width bins in the range.
plt.plot(bins, 1/(theta * np.sqrt(2 * np.pi)) * np.exp( -(bins-mean)**2 / (2*theta**2)),
linewidth=2, color='r')
plt.show()
1.1.2 指数分布仿真
import numpy as np
import matplotlib.pyplot as plt
##指数分布仿真
lamda = 0.5
x = np.arange(0,20,0.1)
y = lamda * np.exp(-lamda*x)
plt.plot(x, y, 'r')
plt.show()
1.1.3 伽马分布仿真
- 数学知识:若一段时间[0,t]内事件A发生的 次数的概率P(x=k) 服从参数为lamda的泊松分布,则两次事件发生的**时间间隔的概率P(Sk-Sk-1 > t)服从参数为lamda的指数分布,n次(a次)事件发生的时间间隔的概率P(Sn+1-S1 > t)**服从伽马分布
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
##伽马分布仿真
fig = plt.figure(figsize=(18, 6))
ax1 = fig.add_subplot(1,2,1) #param: nrow,ncol,index
ax2 = fig.add_subplot(1,2,2)
x = np.arange(0.01, 15, 0.1)
# 形状参数alpha>1, =1, <1对比图
z1 = st.gamma.pdf(x, 0.9, scale=2)
z2 = st.gamma.pdf(x, 1, scale=2)
z3 = st.gamma.pdf(x, 2, scale=2)
ax1.plot(x, z1, label='a<1')
ax1.plot(x, z2, label='a=1')
ax1.plot(x, z3, label='a>1')
ax1.legend(loc="best") #绘制图并指定其位置
ax1.set_xlabel('x')
ax1.set_ylabel('P(x)')
ax1.set_title("Gamma Distribution lamda = 2")
# 形状参数alpha>1对比图
y1 = st.gamma.pdf(x, 1.5, scale=2)
y2 = st.gamma.pdf(x, 2, scale=2)
y3 = st.gamma.pdf(x, 2.5, scale=2)
y4 = st.gamma.pdf(x, 3, scale=2)
ax2.plot(x, y1, label='a=1.5')
ax2.plot(x, y2, label='a=2')
ax2.plot(x, y3, label='a=2.5')
ax2.plot(x, y4, label='a=3')
#ax1.legend(loc="best") #绘制图并指定其位置
ax2.set_xlabel('x')
ax2.set_ylabel('P(x)')
ax2.set_title("Gamma Distribution lamda = 2, alpha>1")
plt.show()
1.1.4 泊松分布
#泊松分布仿真
def possion_distribution(x, lamda):
ans = []
for i in x:
sum = 1
for j in range(1,i+1): #range(a,b):[a,b)
sum *= j
ans.append((lamda**i * np.exp(-lamda)/sum))
return np.array(ans)
lamda = 5
x = np.arange(1,50)
y = possion_distribution(x,lamda)
plt.bar(x,y,color='r') #画条形图
plt.show()
#通过生成泊松分布的随机数,画直方图
x = np.random.poisson(lam=lamda, size=10000)
#plt.hist():返回的value默认是每个区间的计数,在density=True情况下,为每个区间的概率;count的类型为BarContainer
value, bins, count = plt.hist(x, bins=15, density=True) #画频数直方图
print("count:",count)
print("value:",value)
plt.plot(bins[0:15], value)
plt.show()
1.1.5 离散随机数分布
import numpy as np
from collections import defaultdict
#It provides a default value for the key that does not exists.
dic = defaultdict(int)
def sample():
u = np.random.random()
if u <= 0.1:
dic["hello"] += 1
elif u <= 0.3:
dic["java"] += 1
elif u <= 0.6:
dic["python"] += 1
else:
dic["scala"] += 1
def sampleNtimes():
for i in range(1000):
sample()
for k,v in dic.items():
print(k,v)
sampleNtimes()
1.2 非标准分布的随机生成
1.2.1逆变换法
import matplotlib.pyplot as plt
import numpy as np
def ITMExp(Lambda = 2, maxCnt = 50000):
random_no = []
standardXaxis = []
standarExp = []
for i in range(maxCnt):
x = np.random.random()
u = - (1/Lambda) * np.log(1-x)
random_no.append(u)
for j in range(1000):
fx = Lambda*np.exp(-Lambda*(j/100)) #画(0,10)的pdf
standardXaxis.append(j)
standarExp.append(fx)
plt.plot(standardXaxis, standarExp, 'r')
plt.show()
plt.hist(random_no, 1000)
plt.show()
ITMExp()
1.2.2 接受-拒绝法
- 接受拒绝法
import matplotlib.pyplot as plt
import numpy as np
k = 2.5
mu = 1.4
sigma = 1.2
size = int(1e+07) #表明这是科学计数法
xaxis = np.arange(-3, 6, 0.1)
data = np.random.normal(mu, sigma, size)
def p(x):
return (0.3*np.exp(-(x-0.3)**2) + 0.7*np.exp(-(x-2)**2/0.3) )
def q(x):
return 1/(sigma*np.sqrt(2*np.pi)) * np.exp(-(x-1.4)**2/(2*sigma**2))
u = np.random.uniform(0, k*q(data))
sample = data[p(data)>=u]
_,bins,_ = plt.hist(sample,800,density=True)
plt.plot(xaxis, k*q(xaxis),'r')
plt.show()
- 1.2.2-2 自适应拒绝接受法
- 画Beta(2, 3)与画logBeta(2, 3)
import matplotlib.pyplot as plt
import numpy as np
def gamma(x):
ans = 1
#注意:不能用range(5),这样会从0开始,导致ans的值永远为0
for i in range(1,x):
ans = ans * i
return ans
def beta(x, m, n):
return (gamma(m+n) / (gamma(m) * gamma(n))) * x**(m-1) * (1-x)**(n-1)
x = np.linspace(0, 1, 1000)
y = beta(x, 2, 3)
plt.plot(x, y, label="Beta(2,3)")
plt.legend(loc="upper right")
plt.title('Beta(2,3)')
plt.show()
y2 = np.log(y)
plt.plot(x, y2, label="Beta(2,3)")
plt.legend(loc="upper right")
plt.title('log(Beta(2,3))')
plt.show()
1.3 随机过程的随机生成
1.3.1马尔科夫过程仿真生成
- 注意:一维数组需要加[]
- plt.step(): 画阶梯图
import numpy as np
import matplotlib.pyplot as plt
#设状态空间Omiga = {1, 2, 3}
#注意:一维数组需要加[]
transition_P = np.array([0.2, 0.3, 0.5, 0.5, 0.1, 0.4, 0.6, 0.2, 0.2]).reshape(3,3)
state_series = []
state_init = 1 # 假设初始状态为1
state_cur = state_init # 当前状态
state_series.append(state_init)
time_N = 100 # 假设研究转移的100步
time_next = 1 # 第一步转移
while time_next < time_N:
u = np.random.random()
Y = 0
for k in range(0, 3):
Y += transition_P[state_cur-1][k]
if u < Y:
state_series.append(int(k+1))
state_cur = k+1
break
time_next += 1
state_series = np.array(state_series)
xAxis = np.arange(0,100,1)
plt.step(xAxis, state_series)
plt.show()
1.4 基于变分自编码器模型(VAE)的数据生成
- VAE有较完备的数学理论,基于变分推断与混合高斯分布理论,使推导更加直观。
- VAE模型借鉴主成分分析(PCA)的降维思想,将PCA中构成主成分的线性变换W及其WT变换还原输入操作,分别用编码器网络和解码器网络来替换,通过编码器,从一个高维的输入图像映射到一个低维的隐变量上,再利用解码器,将低维的隐变量再映射回高维输入图像。
- VAE模型的基本思路是把真实样本通过编码器网络变换成理想的概率分布(典型的位正态分布),然后利用这个概率分布抽样出样本,传递给解码器网络,得到生成样本。当生成样本与真实样本足够接近时,就训练出了一个自编码器模型,同时在自编码器模型上做进一步变分处理。
2 探索性数据分析
2.1 numpy随机数产生n*1矩阵的数组
datas = np.random.normal(3, 5, 100) # (100, )
for i in datas.shape[0]:
....
2.2 协方差与相关系数
协方差:Cov = E[(X-ux)(Y-uy)]
- 协方差越接近于0越表明两个属性值间不具有线性关系,但协方差越大并不表明越相关,因为协方差的定义中没有考虑属性值本身大小的影响(所以下面引出的相关系数才需要除以标准差,以消除量纲的影响)。
相关系数
3 特征提取、特征构建与特征选择
- 特征选择: 将原有特征x1, x2, x3…xn,通过函数映射f(),得到y1, y2, … yn = f(x1, x2, …, xn)
- 特征构建: 从原始特征中挑选或将原有特征进行变形。如x1, x2 … -> x(n+1),x(n+2),x(n+m)
- 特征选择: 从原有的n个特征中选择m个子特征
4 PCA
- PCA其实就是正交约束下的矩阵分解问题
-这里应该错了,Z = (X - u)T * (X-u)
- 方法一:核心算法部分(可能存在错误)
# k个主成分 k具体的值
def eigValPct(eigVals, percentage):
sortArray = np.sort(eigVals) # np.sort()从小到大排序
sortArray = sortArray[-1::-1]
arraySum = sum(sortArray)
tempSum = 0
num = 0
for i in sortArray:
tempSum += i
num += 1
if tempSum >= arraySum * percentage:
return num
def pca(dataMat, percentage = 0.9):
meanVals = np.mean(dataMat, axis = 0)
# 去中心化,即每一维特征减去各自的平均值
meanRemoved = dataMat - meanVals
# np.cov(): 求矩阵协方差
# rowvar : bool, optional
# If `rowvar` is True (default), then each row represents a
# variable, with observations in the columns.
covMat = np.cov(meanRemoved, rowvar = False) # 求散度矩阵(注意,这里没有除1/n 因为对求特征向量无影响)
# np.linalg.eig(): 求给定矩阵的特征值与特征向量
# np.mat(X): 将X解释为矩阵
eigVals, eigVects = np.linalg.eig(np.mat(covMat))
k = eigValPct(eigVals, percentage)
# np.argsort():返回数组值从小到大排序的下标
eigValInd = np.argsort(eigVals)
eigValInd = eigValInd[:-(k+1):-1]
redEigVects = eigVects[:, eigValInd]
lowDDataMat = np.dot(meanRemoved * redEigVects)
reconMat = (lowDDataMat * redEigVects.T) + meanVals
return lowDDataMat, reconMat
- 方法二:
import numpy as np
from sklearn.decomposition import PCA
def pca(X, k):
n_samples, n_features = X.shape # X: n*d
n = n_samples
d = n_features
mean = np.array([np.mean(X[:,i]) for i in range(d)])
norm_X = X - mean
# scatter matrix: S = norm_XT * norm_X = (X - mean)T * (X - mean)
scatter_mat = np.dot(np.transpose(norm_X),norm_X) # scatter_mat: d * d
# np.linalg.eig()产生的特征向量的元素是在同一列
eigVal, eigVec = np.linalg.eig(scatter_mat)
eigVal = np.abs(eigVal)
# eigVal = np.sort(eigVal) 不能直接对特征值进行排序,因为特征矩阵的列向量是与特征值顺序对应的
eigValIndx = np.argsort(eigVal) # 用eigValIndx存放特征值从小到大排序的index
eigValIndx = eigValIndx[:-(k+1):-1] # 逆序,从大到小,取最大的前K个的index
W = eigVec[:,eigValIndx] # 取对应的特征向量 W: d*k
Z = np.dot(norm_X, W) # n*d * (d*k)T = n * k
return Z
X = np.array([[-1, 1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
print(pca(X,1))
pca = PCA(n_components=1)
pca.fit(X)
print(pca.transform(X))
# 结果不一样的原因:
# sklearn中的PCA是通过svd_flip函数实现的,sklearn对奇异值分解结果进行了一个处理,
# 因为ui*σi*vi=(-ui)*σi*(-vi),也就是u和v同时取反得到的结果是一样的,而这会导致
# 通过PCA降维得到不一样的结果(虽然都是正确的)。具体了解可以看参考文章9或者自己分析
# 一下sklearn中关于PCA的源码。
5 LDA
np.random.random()、np.random.normal()、np.random.randn()、np.random.rand()与np.random.randint()
# np.random.random():在连续区间[0,1)上使用均匀分布采样
# np.random.random((8,2)):表示采样8行2列的矩阵
# np.random.random((8,2) * 5 + 15):表示在连续区间[5,20)上使用均匀分布采样,采用大小为8行2列的矩阵
data1 = np.random.random((8,2)) * 5 + 15 # 直接采样8行2列
data1_1 = np.reshape((np.random.random(16) * 5 + 15),(8,2)) # 采样16个数据,再reshape
# np.random.normal(mu, std): 表示正态分布采用
# np.random.normal(mu, std, num) * k1 + k2: 表示在(mu,sig)=(5,2)的正态分布上采样
# 以下3者等价
data2 = np.random.normal(0,1,(8,2)) * 2 + 5
data2_1 = np.random.normal(0,1,16) * 2 + 5
data2_2 = np.random.normal(5,2,(8,2))
# np.random.randn(n_samples, n_featrues)
n_samples, n_features = 5, 3
rng = np.random.RandomState(0)
X = rng.randn(n_samples,n_features)
# np.random.rand(n_samples, n_featrues)
n_samples, n_features = 5, 3
rng = np.random.RandomState(0)
X = rng.randn(n_samples,n_features)
# numpy.random.randint(low, high=None, size=None, dtype='l')
np.random.randint(5, size=(2, 4)) # => array([[4, 0, 2, 1], [3, 2, 2, 0]])
LDA编程
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
basedir = os.path.dirname(os.path.realpath(__file__))
# 计算均值,要求输入数据为numpy的矩阵格式,行表示样本数,列表示特征,即n*d
def meanX(data):
return np.mean(data, axis=0)
# 计算类内散度矩阵
def getSw(data):
n = np.shape(data)[0]
mu = meanX(data)
Sw = 0
for i in range(0, n):
Sw = Sw + np.transpose(data[i] - mu) * (data[i] - mu) # d*1 * 1*d => d*d
return Sw # d*d
# 计算类间散度矩阵:
def getSb(data1, data2):
# np.vstack: Stack arrays in sequence vertically (row wise)
dataX = np.vstack((data1, data2))
mu1 = meanX(data1)
mu2 = meanX(data2)
mu = meanX(dataX)
Sb = np.transpose(mu1 - mu) * (mu1 - mu) # d*1 * 1*d => d*d
return Sb
def LDA(data1, data2):
Sb = getSb(data1, data2) # 类间散度矩阵
data1 = data1
S1 = getSw(data1)
S2 = getSw(data2)
Sw = S1 + S2 # 类内散度矩阵 d*d
# np.mat().I : 矩阵求逆
eigVal, eigVec = np.linalg.eig(np.mat(Sw).I * Sb)
eigValIdx = np.argsort(eigVal) # 假设lamda为[6,3,5,7] -> [3,0,2,1]
max_lamda = eigVal[eigValIdx[0]]
W = eigVec[eigValIdx[0],:]
return W # 1*d
# 构造数据库
def createDataSet():
data1 = np.mat(np.random.random((8,2))*3+5) #类别A采样区间[5, 8)
data2 = np.mat(np.random.random((8,2))*5+2) #类别B采样区间 [2,7)
return data1, data2
def plotFig(group):
fig = plt.figure()
plt.ylim(0,30)
plt.xlim(0,30)
ax = fig.add_subplot(111)
# 如果无.list(),那么group[0,:]是arrary/matrix类型;而传入的参数需要是list类型
ax.scatter(group[0,:].tolist(), group[1,:].tolist())
plt.show()
plt.savefig(basedir+"/LDAImage.jpg")
if __name__ == "__main__":
x1, x2 = createDataSet() # n行d列
print(x1, x2)
w = LDA(x1,x2) #1*d
print("w:", w)
Z1 = x1 * np.transpose(w)
Z2 = x2 * np.transpose(w)
# np.hstack(): Stack arrays in sequence horizontally (column wise).
# 例如X1=[[1,2],[3,4]] X2=[[5,6],[7,8]] => [[1,3,5,7],[2,4,6,8]]
plotFig( np.hstack((np.transpose(x1), np.transpose(x2))) )
print("Z1:",Z1)
print("Z2:",Z2)
6 EM算法
import numpy as np
import random
def create_sample_data(m, n):
mat_y = np.mat(np.zeros((m, n)))
for i in range(m):
for j in range(n): # 通过产生随机数,每一行表示一次实验结果
mat_y[i, j] = random.randint(0, 1)
return
def em(arr_y, theta, tol, iterator_num):
PI = 0
P = 0
Q = 0
m, n = np.shape(arr_y)
mat_y = arr_y
for i in range(iterator_num):
mu = []
PI = np.copy(theta[0])
P = np.copy(theta[1])
Q = np.copy(theta[2])
for j in range(m):
fenzi = (PI * (P ** mat_y[j]) * ((1-P) ** (1 - mat_y[j])))
fenmu = (PI * (P ** mat_y[j]) * ((1-P) ** (1 - mat_y[j]))) + ((1 - PI) * Q **(mat_y[j]) * (1 - Q) ** (1 - mat_y[j]))
val = fenzi / fenmu
mu.append(val)
# 求PI(i+1)
sum1 = 0.0
for j in range(m):
sum1 += mu[j]
theta[0] = sum1 / m
# 求P(i+1)
sum1 = 0.0
sum2 = 0.0
for j in range(m):
sum1 += mu[j] * mat_y[j]
sum2 += mu[j]
theta[1] = sum1 / sum2
# 求Q(i+1)
sum1 = 0.0
sum2 = 0.0
for j in range(m):
sum1 += (1-mu[j]) * mat_y[j]
sum2 += (1-mu[j])
theta[2] = sum1 / sum2
print("---------------------------")
print(theta)
# 判断是否收敛
if (abs(theta[0] - PI) <= tol and abs(theta[1] - P) <= tol and abs(theta[2] - Q) <= tol):
print("beak")
break
return PI, P, Q
if __name__ == "__main__":
# mat_y = create_data(100, 1)
mat_y = np.array([[1],[1],[0],[1],[0],[0],[1],[0],[1],[1]])
theta = [0.5, 0.6, 0.5]
print(mat_y)
PI, P, Q = em(mat_y, theta, 0.001, 100)
print(PI, P, Q)
7. GMM的EM算法
import numpy as np
def generateData(k, mu, sigma, dataNum):
dataArray = np.zeros(dataNum, dtype=np.float32)
n = len(k)
for i in range(dataNum):
# np.random.random(): return random float in [0, 1)
rand = np.random.random()
Sum = 0
index = 0
# 这一步的意思是说:dataArray[]里的数据,有k1比例来自高斯分布1,
# 有k2比例来自高斯分布2,有k3比例来自高斯分布3,
while(index < n):
Sum += k[index]
if (rand < Sum):
dataArray[i] = np.random.normal(mu[index], sigma[index])
break
else:
index += 1
return dataArray
def normPdf(x, mu, sigma):
return (1 / (np.sqrt(2*np.pi) * sigma)) * np.exp(-(x-mu)**2/(2*sigma**2))
def em(dataArray, k, mu, sigma, step=10):
k = k
mu = mu
sigma =sigma
# 高斯分布个数
n = len(k)
dataNum = len(dataArray)
# 初始化gamma数组
gammaArray = np.zeros((n, dataNum))
for s in range(step):
for i in range(n):
for j in range(dataNum):
# 求P(x[j])
Sum = sum(k[t] * normPdf(dataArray[j], mu[t] ,sigma[t]) for t in range(n))
gammaArray[i][j] = k[i] * normPdf(dataArray[j], mu[i], sigma[i]) / float(Sum)
# 更新mu
for i in range(n):
mu[i] = np.sum(gammaArray[i]*dataArray) / np.sum(gammaArray[i])
# 更新sigma
for i in range(n):
a = np.sum(gammaArray[i] * (dataArray - mu[i])**2)
a2 = np.sum(gammaArray[i])
sigma[i] = np.sqrt(np.sum(gammaArray[i] * (dataArray - mu[i])**2) / np.sum(gammaArray[i]))
# 更新系数k
for i in range(n):
k[i] = np.sum(gammaArray[i]) / dataNum
return k, mu, sigma
if __name__ == "__main__":
k = [0.3, 0.4, 0.3]
mu = [2, 4, 3]
sigma = [1, 1, 4]
dataNum = 5000
# 产生数据
dataArray = generateData(k, mu, sigma, dataNum)
# 参数的初始值
k0 = [0.2, 0.3, 0.4]
mu0 = [1, 2, 2]
sigma0 = [2, 2, 3]
step = 6
k1, mu1, sigma1 = em(dataArray, k0, mu0, sigma0, step)
# 输出参数的值
print("参数实际值:")
print("k:", k)
print("mu:", mu)
print("sigma:", sigma)
# 参数实际值
print("参数实际值:")
print("k1:", k1)
print("mu1:", mu1)
print("sigma1:", sigma1)
8 Metroplis sampling
- 在同一张figure上画多个子图
plt.figure(1)
plt.subplot(121)
plt.plot()
plt.subplot(122)
plt.plot()
import random
from scipy.stats import norm
import matplotlib.pyplot as plt
import numpy as np
def cauthy(theta):
return 1 / (np.pi * (1 + theta**2))
iteration_num = 5000
sigma = 1
thetamin = -30
thetamax = 30
# python生成N个x元素的list的方法
# theta存放每一轮循环的参数
theta = [0.0] * (iteration_num+1)
t = 0
while t < iteration_num:
t += 1
# norm.rvs():正态分布产生随机数,size:Defining number of random variates
# theta_star是个list类型
theta_star = norm.rvs(loc=theta[t-1], scale=sigma, size=1, random_state=None)
alpha = min(1, cauthy(theta_star[0]) / cauthy(theta[t-1]))
u = np.random.uniform(0,1)
if u <= alpha:
theta[t] = theta_star[0]
else:
theta[t] = theta[t-1]
plt.figure(1)
plt.subplot(211)
plt.ylim(thetamin, thetamax)
plt.plot(range(iteration_num+1), theta, 'g-')
plt.subplot(212)
num_bins = 50
plt.hist(theta, num_bins, facecolor='red', alpha=0.5)
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
9 Metroplis-Hasting
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
mu = 3
sigma = 10
# 转移概率
def q(x):
return np.exp(-(x-mu)**2/(sigma**2))
# 按照转移矩阵Q生成样本
def qsample():
return np.random.normal(mu, sigma)
# 目标分布函数p(x)
def p(x):
return 0.3*np.exp(-(x-0.3)**2 + 0.7*np.exp(-(x-2)**2/0.3))
def mcmcsample(n = 20000):
sample = np.zeros(n)
sample[0] = 0.5 #Initialization
for i in range(n-1):
qs = qsample() # 从转移矩阵Q(x)得到样本xt
u = np.random.uniform(0,1)
alpha_ij = (p(qs)*q(sample[i]) / (p(sample[i] * qs)))
if u < min(alpha_ij, 1):
sample[i+1] = qs
else:
sample[i+1] = sample[i]
return sample
x = np.arange(0, 4, 0.1)
realdata = p(x)
sampledata = mcmcsample()
# lw: linewidth
plt.plot(x, realdata, 'r', lw=1) # 理想数据
plt.plot(x, q(x), 'b', lw=1) # Q(x)转移矩阵的数据
plt.hist(sampledata, bins=x, fc='c', density=True)
plt.show()