实例:Autoencoder与聚类结合在预测用户偏好中的应用
上篇博文我们用tensorflow实现了Autoencoder
用tensorflow构建一个两层的auto-encoder
本文具体数据集与源代码可从我的GitHub地址获取
https://github.com/liuzuoping/Deep_Learning_note
项目背景:
- 电商数据,包含30w条数据,一共150种商品品类,1w个会员。特征只有用户的购买记录
项目需求:
- 希望根据用户的偏好进行分群,便于以后推荐和预测。
项目步骤
数据预处理
- 将数据全部转换为一个1w(会员数)*150(品类数)维度的特征矩阵,每个会员对应一行(原始30w条购买记录按会员聚合;此处为部分数据做演示)
import pandas as pd
import numpy as np
import time

# Load the raw purchase records (tab-separated; columns are read as:
# user id, product category, purchase value -- see iloc usage below).
x = pd.read_table('test.txt', sep="\t")
# Drop rows containing NULL values. dropna() returns a NEW frame; the
# original code discarded the result, so NaN rows were silently kept.
x = x.dropna()
a1 = list(x.iloc[:, 0])  # user ids
a2 = list(x.iloc[:, 1])  # product categories
a3 = list(x.iloc[:, 2])  # purchase values
# A is the list of distinct product categories (dict keys are unique).
dicta = dict(zip(a2, zip(a1, a3)))
A = list(dicta.keys())
# B is the list of distinct user ids.
B = list(set(a1))
# Map each category to a dense column index.
dict_class = dict(zip(A, range(len(A))))
# Persist the category -> column-index mapping for later lookups.
with open('class.txt', 'w') as f:
    for k, v in dict_class.items():
        f.write(str(k) + '\t' + str(v) + '\n')
# Time the matrix-building step. time.clock() was removed in Python 3.8;
# perf_counter() is the documented replacement for wall-clock timing.
start = time.perf_counter()
# dictall maps user id -> dense row of per-category purchase values.
dictall = {}
for i in range(len(a1)):  # xrange is Python 2 only; range works on both sizes here
    j = dict_class[a2[i]]
    if a1[i] in dictall:  # O(1) membership test; no need for .keys()
        # Existing user: update the category slot in place (same list object
        # the original re-assigned back into the dict).
        dictall[a1[i]][j] = a3[i]
    else:
        # New user: start from an all-zero row, then set this category.
        value = list(np.zeros(len(A)))
        value[j] = a3[i]
        dictall[a1[i]] = value
# Convert the dict into a users-by-categories DataFrame (transpose puts
# one user per row).
dictall_matrix = pd.DataFrame(dictall).T
print(dictall_matrix)
end = time.perf_counter()
print("赋值过程运行时间是:%f s" % (end - start))
用Autoencoder进行降维
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
class AutoEncoder():
    """Plain fully-connected auto-encoder trained by batch gradient descent.

    Layer/parameter indexing (L = number of layers):
        layer   1 2 ... ... L-1 L
        W       0 1 ... ... L-2
        B       0 1 ... ... L-2
        Z       0 1 ...     L-3 L-2
        A       0 1 ...     L-3 L-2
    """
    def __init__(self, X, Y, nNodes):
        # Training samples; for an auto-encoder Y is normally X itself.
        self.X = X
        self.Y = Y
        # number of samples
        self.M = len(self.X)
        # number of layers and nodes per layer
        self.nLayers = len(nNodes)
        self.nNodes = nNodes
        # network parameters and per-layer scratch buffers
        self.W = list()
        self.B = list()
        self.dW = list()
        self.dB = list()
        self.A = list()
        self.Z = list()
        self.delta = list()
        for iLayer in range(self.nLayers - 1):
            # Weights drawn uniformly from [0, 1); rand(a, b) fills in the
            # same C order as rand(a*b).reshape(a, b) in the original.
            self.W.append(np.random.rand(nNodes[iLayer], nNodes[iLayer + 1]))
            self.B.append(np.random.rand(nNodes[iLayer + 1]))
            self.dW.append(np.zeros([nNodes[iLayer], nNodes[iLayer + 1]]))
            self.dB.append(np.zeros(nNodes[iLayer + 1]))
            self.A.append(np.zeros(nNodes[iLayer + 1]))
            self.Z.append(np.zeros(nNodes[iLayer + 1]))
            self.delta.append(np.zeros(nNodes[iLayer + 1]))
        # value of the cost function
        self.Jw = 0.0
        # Activation function (logistic sigmoid).
        # NOTE: attribute name "sigmod" (sic) kept for backward compatibility.
        self.sigmod = lambda z: 1.0 / (1.0 + np.exp(-z))
        # learning rate
        self.alpha = 2.5
        # number of training iterations
        self.steps = 10000

    def BackPropAlgorithm(self):
        """Run one full batch: forward/backward over all M samples, then update."""
        # Reset accumulated cost and gradients.
        self.Jw = 0.0
        for iLayer in range(self.nLayers - 1):
            self.dW[iLayer].fill(0.0)
            self.dB[iLayer].fill(0.0)
        for i in range(self.M):
            # --- forward propagation ---
            for iLayer in range(self.nLayers - 1):
                layer_input = self.X[i] if iLayer == 0 else self.A[iLayer - 1]
                self.Z[iLayer] = np.dot(layer_input, self.W[iLayer])
                self.A[iLayer] = self.sigmod(self.Z[iLayer] + self.B[iLayer])
            # --- backward propagation (output layer first) ---
            for iLayer in reversed(range(self.nLayers - 1)):
                if iLayer == self.nLayers - 2:  # output layer
                    # Error against the target Y[i]. The original used X[i]
                    # for the delta but Y[i] for the cost; unified to Y[i]
                    # (identical when trained as an auto-encoder with Y == X).
                    err = self.Y[i] - self.A[iLayer]
                    self.delta[iLayer] = -err * (self.A[iLayer] * (1 - self.A[iLayer]))
                    self.Jw += np.dot(err, err) / self.M
                else:
                    # BUG FIX: the delta must be propagated through the NEXT
                    # layer's weights, W[iLayer + 1]. The original used
                    # W[iLayer].T, which is only shape-compatible for
                    # symmetric 3-layer nets (and computes a wrong gradient
                    # even then); deeper nets raised a dimension mismatch.
                    self.delta[iLayer] = (np.dot(self.W[iLayer + 1], self.delta[iLayer + 1])
                                          * (self.A[iLayer] * (1 - self.A[iLayer])))
                # Accumulate gradients: outer product of the layer input
                # and this layer's delta (same result as the original
                # newaxis/transpose broadcasting).
                layer_input = self.X[i] if iLayer == 0 else self.A[iLayer - 1]
                self.dW[iLayer] += np.outer(layer_input, self.delta[iLayer])
                self.dB[iLayer] += self.delta[iLayer]
        # --- gradient-descent update, averaged over the batch ---
        for iLayer in range(self.nLayers - 1):
            self.W[iLayer] -= (self.alpha / self.M) * self.dW[iLayer]
            self.B[iLayer] -= (self.alpha / self.M) * self.dB[iLayer]

    def PlainAutoEncoder(self):
        """Train for self.steps iterations, printing the cost each step."""
        for i in range(self.steps):
            self.BackPropAlgorithm()
            print("step:%d" % i, "Jw=%f" % self.Jw)

    def ValidateAutoEncoder(self):
        """Forward-propagate each sample and print every layer's activations."""
        for i in range(self.M):
            print(self.X[i])
            for iLayer in range(self.nLayers - 1):
                layer_input = self.X[i] if iLayer == 0 else self.A[iLayer - 1]
                self.Z[iLayer] = np.dot(layer_input, self.W[iLayer])
                self.A[iLayer] = self.sigmod(self.Z[iLayer] + self.B[iLayer])
                print("\t layer=%d" % iLayer, self.A[iLayer])
# Load the feature matrix written by the preprocessing step: each line is
# "<user_id>\t<v1 v2 ... vk>" with space-separated per-category values.
data = []
index = []
with open('./data_matrix.txt', 'r') as f:  # with-block guarantees the file is closed
    for line in f:
        ss = line.replace('\n', '').split('\t')
        index.append(ss[0])
        data.append([float(v) for v in ss[1].split(' ')])
x = np.array(data)
# Standardise each feature column to zero mean / unit variance so the
# sigmoid units do not saturate on raw purchase magnitudes.
xx = preprocessing.scale(x)
# 10-5-10 auto-encoder: the 5-unit hidden layer is the compressed code
# used downstream for clustering users by preference.
nNodes = np.array([10, 5, 10])
ae3 = AutoEncoder(xx, xx, nNodes)
ae3.PlainAutoEncoder()
ae3.ValidateAutoEncoder()