本例中用神经网络来实现对搜索关键字点击的预测,在搜索引擎中,用户输入关键字后,引擎会给出搜索结果列表,而后用户会点击其中一个网页。在用户不断的点击行为中,神经网络会不断学习,给用户提供更好的推荐和结果排序。
本文的代码实现一个多层感知机网络,第一层接受输入,第二层是隐含层,第三层是输出层。应用反向传播算法(BP)对神经网络进行训练。关于神经网络具体的原理此处不赘述,请参看权威书籍或博客。
# -*- coding: utf-8 -*-
__author__ = 'Bai Chenjia'
import sys
from sqlite3 import dbapi2 as sqlite
from math import *
reload(sys)
sys.setdefaultencoding('utf8')
"""
基于反向误差传播的神经网络
"""
class searchnet:
def __init__(self, dbname):
self.con = sqlite.connect(dbname)
def __del__(self):
self.con.close()
# 存储神经网络的结构信息.
# hiddennode存储隐含层节点,wordhidden和hiddenurl分别存储输入层-隐含层和隐含层-输出层的权值
def maketables(self):
self.con.execute('create table hiddennode(create_key)')
self.con.execute('create table wordhidden(fromid, toid, strength)')
self.con.execute('create table hiddenurl(fromid, toid, strength)')
self.con.commit()
# 该方法通过给定层数layer(0 or 1)和 fromid,toid,返回该条边连接的权值
# 如果不存在该连接,若为 输入层-隐含层,返回-0.2,若为 隐含层-输出层,返回0
def getstrength(self, fromid, toid, layer):
if layer == 0:
table = "wordhidden"
else:
table = "hiddenurl"
res = self.con.execute('select strength from %s where fromid=%d and toid=%d' % (table, fromid, toid)).fetchone()
# 如果没有该边的记录,则返回默认值
if res == None:
if layer == 0:
return -0.2
if layer == 1:
return 0.5
return res[0]
# 该方法通过给定层数layer(0 or 1),fromid,toid,和权值strength,更新指定边的权值
# 如果该边不存在则新建,如果该边存在则更新
def setstrength(self, fromid, toid, layer, strength):
if layer == 0:
table = 'wordhidden'
else:
table = 'hiddenurl'
# 首先检索是否存在该边的权值,检索行号
res = self.con.execute('select rowid from %s where fromid=%d and toid=%d' %
(table, fromid, toid)).fetchone()
# 如果不存在则插入记录
if res is None:
self.con.execute('insert into %s (fromid, toid, strength) values (%d, %d, %d)' %
(table, fromid, toid, strength))
# 如果存在则更新权值
else:
rowid = res[0]
self.con.execute('update %s set strength=%d where rowid=%d' % (table, strength, rowid))
# 该方法创建隐藏节点. 构建的神经网络并非开始便创建所有节点,而是在需要时建立新的隐藏节点
# 每传入一组以前从未见过的单词组合,该函数就会在隐藏层中建立一个新的节点
# 随后,函数会为单词与隐藏节点之间,以及查询节点与URL节点之间建立起具有默认权重的连接(setstrength函数)
def generatehiddennode(self, wordids, urls):
# 只为三个检索词以下的短文检索建立隐藏节点
if len(wordids) > 3:
return None
createkey = '_'.join(sorted([str(wi) for wi in wordids]))
# 检索是否含有该隐含层节点
res = self.con.execute(
"select rowid from hiddennode where create_key='%s'" % createkey).fetchone()
# 如果没有则新建
if res is None:
cursor = self.con.execute(
"insert into hiddennode (create_key) values ('%s')" % createkey)
# 获取最后插入行的主键
hiddenid = cursor.lastrowid
# 设置 输入层-隐含层 的默认权重
for wordid in wordids:
self.setstrength(wordid, hiddenid, 0, 1.0 / len(wordids))
# 设置 隐含层-输出层 的默认权重
for url in urls:
self.setstrength(hiddenid, url, 1, 0.1)
self.con.commit()
# 输入一组训练数据,即(检索词,URL地址),从隐含层中找出与该项查询相连接的隐含层节点编号
def getallhiddenids(self, wordids, urlids):
l1 = {}
# 查询与检索词(输入层)相连接的隐含层节点
for wordid in wordids:
cursor1 = self.con.execute('select toid from wordhidden where fromid=%d' % wordid)
for row in cursor1:
l1[row[0]] = 1
for urlid in urlids:
cursor2 = self.con.execute('select fromid from hiddenurl where toid=%d' % urlid)
for row in cursor2:
l1[row[0]] = 1
return l1.keys()
# 利用数据库中保存的信息建立包含所有当前权重值在内的相应网络
# 包括 单词列表、隐含层节点、输出节点. 每个节点的输出值. 边的权值矩阵
def setupnetwork(self, wordids, urlids):
# 单词列表,隐含层列表,输出层列表
self.wordids = wordids
self.hiddenids = self.getallhiddenids(wordids, urlids)
self.urlids = urlids
# 输入层,隐含层,输出层 每个节点的输出值
self.ai = [1.0] * len(self.wordids)
self.ah = [1.0] * len(self.hiddenids)
self.ao = [1.0] * len(self.urlids)
# 建立 输入层-隐含层 和 隐含层-输出层 权重矩阵
print "**", self.hiddenids[:]
self.wi = [[self.getstrength(wordid, hiddenid, 0) for hiddenid in self.hiddenids] for wordid in self.wordids]
self.wo = [[self.getstrength(wordid, hiddenid, 1) for urlid in self.urlids] for hiddenid in self.hiddenids]
# 前馈算法:算法接收一列输入将其推入网络,然后返回所有输出层节点的输出结果
def feedforward(self):
# 输入节点为要查询的单词
for i in range(len(self.wordids)):
self.ai[i] = 1.0
# 由输入层节点和权值矩阵计算隐含层节点值
for j in range(len(self.hiddenids)):
sum = 0.0
for i in range(len(self.wordids)):
sum += self.ai[i] * self.wi[i][j]
self.ah[j] = tanh(sum)
# 由隐含层节点值和权值矩阵计算输出层节点
for k in range(len(self.urlids)):
sum = 0.0
for j in range(len(self.hiddenids)):
sum += self.ah[j] * self.wo[j][k]
self.ao[k] = tanh(sum)
# 返回输出层节点的输出值(代表每个url被点击的可能性)
print "神经网络前馈算法输出为", self.ao[:]
return self.ao[:]
# 初始化网络,运行前馈算法,产生输出值
def getresult(self, wordids, urlids):
self.setupnetwork(wordids, urlids)
return self.feedforward()
# S型函数的导数
def dtanh(self, y):
return 1.0-y*y
# 反向误差传播算法,修正权值
# 分别计算输出层和隐含层的误差,根据误差修正两层之间的权值。N为学习率
def backPropagate(self, targets, N=0.5):
# 计算输出层的误差
output_deltas = [0.0] * len(self.urlids)
# 循环输出层节点
for k in range(len(self.urlids)):
error = targets[k] - self.ao[k]
#print error
output_deltas[k] = self.dtanh(self.ao[k]) * error
print "输出层误差:", output_deltas[:]
# 计算隐含层的误差
hidden_deltas = [0.0] * len(self.hiddenids)
for i in range(len(self.hiddenids)):
error = 0.0
# 计算一个隐含层节点的误差需要累加所有输出层节点
for j in range(len(self.urlids)):
error += output_deltas[j] * self.wo[i][j]
hidden_deltas[i] = self.dtanh(self.ah[i]) * error
print "隐含层误差:", hidden_deltas[:]
# 更新输出权重
for i in range(len(self.hiddenids)):
for j in range(len(self.urlids)):
change = output_deltas[j] * self.ah[i]
self.wo[i][j] = self.wo[i][j] + N * change
# 更新输入权重
for i in range(len(self.wordids)):
for j in range(len(self.hiddenids)):
change = hidden_deltas[j] * self.ai[i]
self.wi[i][j] = self.wi[i][j] + N * change
# 训练函数:建立神经网络,运行前馈算法和反向传播
# 该方法接受 wordids列表,urlids列表 以及 一个选择的URL作为参数
def trainquery(self, wordids, urlids, selectedurl):
# 根据 wordids 和 urlids 生成一个隐藏节点
self.generatehiddennode(wordids, urlids)
# 建立神经网络
self.setupnetwork(wordids, urlids)
print self.wi[:]
# 前馈传播
self.feedforward()
# 设定目标URL,即正确的URL对应的输出节点值为1,其余节点为0
targets = [0.0] * len(urlids)
targets[urlids.index(selectedurl)] = 1.0
#print selectedurl, urlids.index(selectedurl), targets[:]
# 误差反向传播调整权值,完成一次训练
self.backPropagate(targets)
print "各层网络输出值:", self.ai[:], self.ah[:], self.ao[:]
print "各层网络权值矩阵: ", self.wo[:], self.wi[:]
self.feedforward()
self.updatedatabase()
# 使用函数 trainquery 完成一次训练后在末尾需要将更新后的权值写入数据库
def updatedatabase(self):
# 更新 输入层-隐含层 权值
for i in range(len(self.wordids)):
for j in range(len(self.hiddenids)):
self.setstrength(self.wordids[i], self.hiddenids[j], 0, self.wi[i][j])
# 更新 隐含层-输出层 权值
for i in range(len(self.hiddenids)):
for j in range(len(self.urlids)):
self.setstrength(self.hiddenids[i], self.urlids[j], 1, self.wo[i][j])
self.con.commit()
if __name__ == '__main__':
#1. 新建数据库,生成带有样例单词和URL ID的隐藏节点
"""
mynet = searchnet('nn.db')
#mynet.maketables()
wWorld, wRiver, wBank = 101, 102, 103
uWorldBank, uRiver, uEarth = 201, 202, 203
mynet.generatehiddennode([wWorld, wBank], [uWorldBank, uRiver, uEarth])
print 'wordhidden -------'
for c in mynet.con.execute('select * from wordhidden'):
print c
print '\nhiddenurl -------'
for c in mynet.con.execute('select * from hiddenurl'):
print c
print '\nhiddennode -------'
for c in mynet.con.execute('select * from hiddennode'):
print c
"""
"""
运行结果:
wordhidden -------
(101, 1, 0)
(103, 1, 0)
hiddenurl -------
(1, 201, 0)
(1, 202, 0)
(1, 203, 0)
hiddennode -------
(u'101_103',)
[Finished in 0.1s]
"""
#mynet = searchnet('nn.db')
#res = mynet.con.execute('select toid from wordhidden where fromid=103')
#for row1 in res:
# print row1[0]
# 初始化神经网络,运行前馈传播算法产生输出值
wWorld, wRiver, wBank = 301, 302, 303
uWorldBank, uRiver, uEarth = 401, 402, 403
mynet = searchnet('nn.db')
mynet.con.execute('delete from hiddennode')
wordids = [wWorld, wBank]
urlids = [uWorldBank, uRiver, uEarth]
mynet.trainquery(wordids, urlids, uWorldBank)
mynet.getresult(wordids, urlids)