python数据挖掘案例系列教程——python实现搜索引擎

腾讯AI架构师

已于 2022-03-27 16:53:37 修改

阅读量1.8w

点赞数 27

分类专栏： python 机器学习后端爬虫系列课程文章标签： python 机器学习数据挖掘搜索引擎神经网络

于 2018-01-07 20:32:07 首次发布

本文链接：https://blog.csdn.net/luanpeng825485697/article/details/78997189

版权

python 机器学习后端爬虫系列课程专栏收录该内容

175 篇文章 126 订阅

订阅专栏

全栈工程师开发手册（作者：栾鹏）

python数据挖掘系列教程

今天我们使用python实现一个网站搜索引擎。主要包含两个部分。网站数据库的生成、搜索引擎。其中搜索引擎部分我们使用单词频度算法、单词距离算法、外部回值算法、链接文本算法、pagerank算法和神经网络学习等6种算法来实现搜索排名。

我们这里将http://blog.csdn.net/luanpeng825485697站点下的所有网站当做一个小型服务器。对该网站域名下的网页进行搜索引擎设计。

由于需要获取博客文章的单词作为每篇博客的特征属性，并且我们的博客都是中文，所以在学习获取博客数据前，需要学习结巴中文分词库的使用。参考：
http://blog.csdn.net/luanpeng825485697/article/details/78757563

在我们的代码中，我们将模型固话在sqlite数据库中。所以在学习获取博客数据前，还需要学习sqlite3库的使用，参考：
http://blog.csdn.net/luanpeng825485697/article/details/78361168

由于我们会将所有的数据都存储在数据库中以便今后常用，所以代码中会有部分sqlite数据库的相关代码操作。

调试环境python3.6

gitup网址：https://github.com/626626cdllp/data-mining/tree/master/Search-Engines

第一部分、爬取网站生成网站数据库

这一部分，我们的目标是将域名下的所有网页的信息生成几个数据集。

这里写图片描述

sqlite数据库默认的主键索引名为rowid，mysql数据库默认的主键索引名为id。

第一张表urllist、保存的是已经url列表，字段hascrawl表示该网页是否已经爬取过。
第二张表wordlist、保存的是整个网站的单词列表，不包含重复单词。
第三张表wordlocation、保存的是单词在文档中所处位置的列表。单词的位置为单词在该文档内容所形成的单词列表中的索引。
第四张表link、保存了两个urlid，指明从一张表到另一张表的跳转关系。
第五张表linkwords、则利用wordid和linkid记录链接的描述。

下面我们就来爬取整个网站，生成这五张表，这里我们使用urllib获取响应，所以不能爬取js动态加载的网站。如果想爬取动态加载的网站，可以参考http://blog.csdn.net/luanpeng825485697/article/details/78436963

将下面的代码存储成spyder.py

# 根据连接爬取中文网站，获取标题、子连接、子连接数目、连接描述、中文分词列表，
import urllib
from bs4 import BeautifulSoup
import bs4
import sqlite3
import os
import jieba   #对中文进行分词
import traceback

# 分词时忽略下列词，即不为这些单词建立数据库
biaodian = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+ ，。？‘’“”！；：\r\n、（）…'     #所有的标点符号
ignorewords = list(set(biaodian))   #去重
ignorewords.append('\r\n')   #另外添加一个忽略的项


# 定义爬虫类。获取链接的标题、描述、分词、深度
class crawler:
    def __init__(self,dbname):
        self.urls={}              #创建爬取列表
        self.con = sqlite3.connect(dbname)
        self.curs = self.con.cursor()
        try:
            self.createindextables()
        except:
            pass

    def __del__(self):  # 关闭数据库
        self.curs.close()
        self.con.close()

    def dbcommit(self):  # 保存数据库
        self.con.commit()

    # 创建数据库表
    def createindextables(self):
        self.curs.execute('create table urllist(url,hascrawl)')  # 创建url列表数据表
        self.curs.execute('create table wordlist(word)')  # 创建word列表数据表
        self.curs.execute('create table wordlocation(urlid,wordid,location)')  # 创建url-word-location数据表（每个链接下出现的所有单词和单词出现的位置）
        self.curs.execute('create table link(fromid integer,toid integer)')  # 创建url-url数据表（当前链接和目标链接对）
        self.curs.execute('create table linkwords(wordid,linkid)')  # 创建word-url数据表（链接描述）
        self.curs.execute('create index wordidx on wordlist(word)')  # 创建索引
        self.curs.execute('create index urlidx on urllist(url)')  # 创建索引
        self.curs.execute('create index wordurlidx on wordlocation(wordid)')  # 创建索引
        self.curs.execute('create index urltoidx on link(toid)')  # 创建索引
        self.curs.execute('create index urlfromidx on link(fromid)')  # 创建索引
        self.dbcommit()

	#获取一个dom内的所有单词。soup为dom元素
    def getword(self,soup):
        # 获取每个单词
        text=self.gettextonly(soup)   #提取所有显示出来的文本
        allword=self.separatewords(text)  #使用分词器进行分词
        return allword

    # 根据一个dom元素提取文字（不带标签的）。由外至内获取文本元素。style和script内的文字忽略
    def gettextonly(self,soup):
        v=soup.string
        if v==None:
            c=soup.contents   # 直接子节点的列表，将<tag>所有儿子节点存入列表
            resulttext=''
            for t in c:
                if t.name=='style' or t.name=='script':   #当元素为style和script和None时不获取内容
                    continue
                subtext=self.gettextonly(t)
                resulttext+=subtext+'\n'
            return resulttext.strip()
        else:
            if isinstance(v,bs4.element.Comment):   #代码中的注释不获取
                return ''
            return v.strip()

    # 使用结巴分词，同时去除标点符号
    def separatewords(self,text):
        seg_list = jieba.cut(text, cut_all=False)  #使用结巴进行中文分词
        allword = []
        for word in seg_list:
            if word not in ignorewords:
                allword.append(word)
                # print(allword)
        return allword

    #爬虫主函数
    def crawl(self,url,host):
        if url in self.urls and self.urls[url]['hascrawl']:return  # 如果网址已经存在于爬取列表中，并且已经爬取过，则直接返回
        try:
            if url in self.urls:    
                if self.urls[url]['hascrawl']:return
                else: self.urls[url]['hascrawl']=True  # 如果网址已存在，但是并没有爬取过，就设置hascrawl为True
            else:
                self.urls[url]={}
                self.urls[url]['hascrawl']=True
            response=urllib.request.urlopen(url)   #获取响应流
            text = str(response.read(), encoding='utf-8')   #转化为utf-8编码的字符串
            soup = BeautifulSoup(text, 'html.parser')   #解析成dom树

            links = soup('a')  #获取所有链接
            for link in links:
                if ('href' in dict(link.attrs)):   #如果链接元素存在href属性
                    newurl = urllib.parse.urljoin(url, link['href'])  #获取链接的绝对网址
                    if not host in newurl: continue  # 非服务范围网址不爬取，不记录
                    if newurl == url: continue  # 如果网址是当前网址，不爬取，不记录
                    if newurl.find("'") != -1: continue  # 包含'的链接不爬取，不记录
                    newurl = newurl.split('#')[0]  # 去掉#后的数据部分
                    if newurl[0:4] == 'http':  # 只处理http协议
                        if newurl not in self.urls:  # 将链接加入爬取列表
                            self.urls[newurl] = {}
                            self.urls[newurl]['hascrawl'] = False 
                        # 添加链接描述
                        linkText = self.gettextonly(link).strip()  # 获取链接的描述
                        self.addlinkref(url, newurl, linkText)  # 添加链接跳转对和链接描述

            self.addtoindex(url, soup.body)  # 创建链接列表库和链接-分词库
            self.dbcommit()    #保存
            return True
        except:
            traceback.print_exc()
            return False
            # print("Could not parse page %s" % url)


    # 添加链接列表库和链接-分词索引
    def addtoindex(self, url, soup):
        # 查询-增加网址获得urlid
        urlid = self.get_add_id('urllist', 'url', url)
        # 提取所有单词
        allword = self.getword(soup)  # 提取所有显示出来的文本
        print(allword)
        # 将每个单词与该url关联，写入到数据库
        index = 0
        for word in allword:
            index += 1
            # 查询-增加单词获得wordid 
            wordid = self.get_add_id('wordlist', 'word', word)
            self.curs.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid, wordid, index))

    # 添加链接跳转对，和链接-描述文本。
    def addlinkref(self, urlFrom, urlTo, linkText):
        words = self.separatewords(linkText)
        fromid = self.get_add_id('urllist', 'url', urlFrom)   #参数：表名、列名、值
        toid = self.get_add_id('urllist', 'url', urlTo)   #参数：表名、列名、值
        if fromid == toid: return
        cur = self.curs.execute("insert into link(fromid,toid) values (%d,%d)" % (fromid, toid))
        linkid = cur.lastrowid
        for word in words:
            wordid = self.get_add_id('wordlist', 'word', word)   #参数：表名、列名、值
            self.curs.execute("insert into linkwords(linkid,wordid) values (%d,%d)" % (linkid, wordid))

    # 辅助函数，用于获取数据库中记录的id，并且如果记录不存在，就将其加入数据库中，再返回id
    def get_add_id(self, table, field, value):
        command = "select rowid from %s where %s='%s'" % (table, field, value)
        cur = self.curs.execute(command)
        res = cur.fetchall()
        if res == None or len(res) == 0:
            cur = self.curs.execute("insert into %s (%s) values ('%s')" % (table, field, value))
            return cur.lastrowid
        else:
            return res[0][0]   #返回第一行第一列

上面就完成了一个基本的爬虫类，当然今天我们还会在这个类中添加一些功能，以适应更加丰富的搜索。

下面我们就可以借用这个类来爬取一个网站了。

# 爬取指定域名范围内的所有网页，beginurl为开始网址，host为根网址
def crawlerhost(beginurl,host,dbname):
    mycrawler = crawler(dbname)      #定义爬虫对象
    mycrawler.crawl(beginurl,host)  #爬取主页
    for url in list(mycrawler.urls.keys()):
        print(url)
        mycrawler.crawl(url, host)  # 爬取子网页
       
    # 获取pageRank数据
    # mycrawler.calculatepagerank()

爬取部分算是完成了。下面就可以来测试一下了。

# 读取数据库信息，检验是否成功建立了搜索数据库
def readdb(dbname):
    con = sqlite3.connect(dbname)
    curs = con.cursor()
    command = "select fromid,toid from link"   #'select * from link'
    cur = curs.execute(command)
    res = cur.fetchall()
    allurl=[]
    if res != None and len(res) > 0:
        for row in res:
            print(row)
            command = "select url from urllist where rowid=%d" % row[0]
            fromurl = curs.execute(command).fetchall()[0][0]
            command = "select url from urllist where rowid=%d" % row[1]
            tourl = curs.execute(command).fetchall()[0][0]
            # print(fromurl,tourl)
            if fromurl not in allurl:
                allurl.append(fromurl)
            if tourl not in allurl:
                allurl.append(tourl)





if __name__ == '__main__':
    # 爬取服务器建立数据库
    url = 'http://blog.csdn.net/luanpeng825485697'
    # if os._exists('csdn.db'):
    #     os.remove('csdn.db')    #删除旧的数据库
    crawlerhost(url, url,'csdn.db')

    #读取数据库
    readdb('csdn.db')

第二部分、搜索

这一部分我们根据用户的输入，从数据库中获取相关的链接集。在下一部分链接集进行排序使用。

我们为搜索引擎新建一个模块，命名为searchengine.py

先引入一些必要的模块和变量

# 搜索和排名
import urllib
from bs4 import BeautifulSoup
import re
import sqlite3
import nn
import os
import spyder   #获取爬虫数据集
import jieba

# 分词时忽略下列词
biaodian = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+ ，。？‘’“”！；：\r\n、（）…'     #所有的标点符号
ignorewords = list(set(biaodian))   #去重
ignorewords.append('\r\n')   #添加一个不忽略的项

在模块中我们先建立一个搜索引擎类。


# 定义搜索引擎类
class searcher:
    def __init__(self,dbname):
        self.con=sqlite3.connect(dbname)  #链接数据库
        self.curs = self.con.cursor()

    def __del__(self):
        self.curs.close()
        self.con.close()

搜索引擎类的一个任务就是能根据用户搜索的字符串分割成多个关键词，再查询到相关的网址。而只要文章中出现了每个关键词，就认为网页相关。这一部分代码比较难懂，建议读者边运行边学习。

查询过程：

1、对输入字符串进行分词获得多个关键词。例如我们查询“腾讯股票”，会分词成“腾讯”、“股票”
2、查询每个关键词在单词库wordlist数据库中的索引。例如我们会查询到“腾讯”在单词库中的索引为126，“股票在单词库中的索引为127”
3、利用这两个关键词索引把所有包含这两个关键词的链接以及这两个关键词在链接中出现的位置查询出来。形成以下的数据格式

[
	 (urlid1,wordlocation1_1,wordlocation2_1),  # 链接1中，单词1的位置1，单词2的位置1
	 (urlid1,wordlocation1_1,wordlocation2_2),  # 链接1中，单词1的位置1，单词2的位置2
	 (urlid1,wordlocation1_1,wordlocation2_3),  # 链接1中，单词1的位置1，单词2的位置3
	 (urlid1,wordlocation1_2,wordlocation2_1),  # 链接1中，单词1的位置2，单词2的位置1
	 (urlid1,wordlocation1_2,wordlocation2_2),  # 链接1中，单词1的位置2，单词2的位置2
	 (urlid1,wordlocation1_2,wordlocation2_3),  # 链接1中，单词1的位置2，单词2的位置3
	 ...
	 (urlid2,wordlocation1_1,wordlocation2_1),  # 链接2中，单词1的位置1，单词2的位置1
	 (urlid2,wordlocation1_1,wordlocation2_2),  # 链接2中，单词1的位置1，单词2的位置2
	 ...
]

在下面的语句中我们会用到一个数据库查询语句。
例如我们查询了字符串中包含“腾讯”、“股票”两个关键词，在单词列表wordlist中的索引分别为126、和127。

则我们使用下面的语法查询出数据集。

select w0.urlid,w0.location,w1.location from wordlocation w0,wordlocation w1 where w0.wordid=126 and w0.urlid=w1.urlid and w1.wordid=127

上面的代码先看where后面的语句。w0表示一个数据表，w1表示另一个数据表。
下面的语句表示满足条件：
w0表的wordid字段为126的所有w0中的记录
w1表的wordid字段为127的所有w1中的记录
w0表的urlid字段与w1表中urlid字段存在相同值的所有w0中的记录和w1中的所有记录。
where后的语句查询上述数据集的交集。

where w0.wordid=126 and w0.urlid=w1.urlid and w1.wordid=127

再来看from后面的语句。表示wordlocation表可以用w0这个名称代替，wordlocation这个表也可以用w1这个名称代替。

from wordlocation w0,wordlocation w1

最后来看select后的语句。上面查询到的数据集既有w0中的又有w1中的，把w0中记录的urlid字段、w0中记录的location字段、w1中记录的location字段提取出来。

select w0.urlid,w0.location,w1.location

最终形成了我们想要的数据集格式。

[
	 (urlid1,wordlocation1_1,wordlocation2_1),  # 链接1中，单词1的位置1，单词2的位置1
	 (urlid1,wordlocation1_1,wordlocation2_2),  # 链接1中，单词1的位置1，单词2的位置2
	 (urlid1,wordlocation1_1,wordlocation2_3),  # 链接1中，单词1的位置1，单词2的位置3
	 (urlid1,wordlocation1_2,wordlocation2_1),  # 链接1中，单词1的位置2，单词2的位置1
	 (urlid1,wordlocation1_2,wordlocation2_2),  # 链接1中，单词1的位置2，单词2的位置2
	 (urlid1,wordlocation1_2,wordlocation2_3),  # 链接1中，单词1的位置2，单词2的位置3
	 ...
	 (urlid2,wordlocation1_1,wordlocation2_1),  # 链接2中，单词1的位置1，单词2的位置1
	 (urlid2,wordlocation1_1,wordlocation2_2),  # 链接2中，单词1的位置1，单词2的位置2
	 ...
]

查询实现代码如下：

# 使用结巴分词，去除标点符号
    def separatewords(self, text):
        seg_list = jieba.cut(text, cut_all=False)  # 使用结巴进行中文分词
        allword = []
        for word in seg_list:
            if word not in ignorewords:
                allword.append(word)
        # print(allword)
        return allword

    # 根据搜索字符串分词后获取查询到的链接
    def getmatchrows(self,querystr):
        # 构造数据库的查询字符串（搜索字符串根据空格分割成查询字符串列表）
        fieldlist='w0.urlid'
        tablelist=''
        clauselist=''
        wordids=[]

        # words=querystr.strip().split(' ')   # 根据空格分割单词
        words = self.separatewords(querystr)  #使用结巴进行中文分词
        tablenumber=0
        for word in words:
            # 获取单词的id
            wordrow=self.curs.execute("select rowid from wordlist where word='%s'" % word).fetchall()
            if wordrow!=None and len(wordrow)> 0:
                wordid=wordrow[0][0]  #获取单词id
                wordids.append(wordid)
                if tablenumber>0:
                    tablelist+=','
                    clauselist+=' and '
                    clauselist+='w%d.urlid=w%d.urlid and ' % (tablenumber-1,tablenumber)
                fieldlist+=',w%d.location' % tablenumber
                tablelist+='wordlocation w%d' % tablenumber
                clauselist+='w%d.wordid=%d' % (tablenumber,wordid)
                tablenumber+=1

        # 根据各个组分，建立查询。为列表中的每个单词，建立指向wordlocation表的引用，并根据对应的urlid将它们连接起来进行联合查询
        fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist)
        # print(fullquery)
        cur=self.curs.execute(fullquery)
        rows=[row for row in cur.fetchall()]

        return rows,wordids

经过上面的查询语句，返回的rows，wordids。其中
wordids样式如[126, 127]，表示每个查询关键词在单词库wordlist数据库中的索引。这个比较简单。

而rows的样式比较复杂，但他是理解后续代码的关键。

下面假设搜索字符串解析出了两个关键词。并且这两个关键词在某一个网页中可能都出现了很多次。
rows的样式为

[
	 (urlid1,wordlocation1_1,wordlocation2_1),  # 链接1中，单词1的位置1，单词2的位置1
	 (urlid1,wordlocation1_1,wordlocation2_2),  # 链接1中，单词1的位置1，单词2的位置2
	 (urlid1,wordlocation1_1,wordlocation2_3),  # 链接1中，单词1的位置1，单词2的位置3
	 (urlid1,wordlocation1_2,wordlocation2_1),  # 链接1中，单词1的位置2，单词2的位置1
	 (urlid1,wordlocation1_2,wordlocation2_2),  # 链接1中，单词1的位置2，单词2的位置2
	 (urlid1,wordlocation1_2,wordlocation2_3),  # 链接1中，单词1的位置2，单词2的位置3
	 ...
	 (urlid2,wordlocation1_1,wordlocation2_1),  # 链接2中，单词1的位置1，单词2的位置1
	 (urlid2,wordlocation1_1,wordlocation2_2),  # 链接2中，单词1的位置1，单词2的位置2
	 ...
]

可以看出列表中的每个元组中，第一列代表是网址id，后面的列是每个关键词出现的位置的一种组合。

第三部分、排名算法

上面已经根据用户的输入获取到了相关的网址数据。
获取到的数据中rows的形式如下
[(urlid1,wordlocation1_1,wordlocation1_2,wordlocation1_3…),(urlid2,wordlocation2_1,wordlocation2_2,wordlocation2_3…)]
列表的每个元素是一个元组，每个元素的内容是urlid和每个关键词在该文档中的位置。

wordids形式为[wordid1, wordid2, wordid3…]，即每个关键词所对应的单词id

我们将会介绍几种排名算法，所谓排名也就是根据各自的规则为每个链接评分，评分越好。并且最终我们会将几种排名算法综合利用起来，给出最终的排名。既然要综合利用，那么我们就要先实现每种算法。在综合利用时会遇到几个问题。

1、每种排名算法评分机制不同，给出的评分尺度和含义也不尽相同
2、如何综合利用，要考虑每种算法的效果。为效果好的给与较大的权重。

我们先来考虑第一个问题，如何消除每种评分算法所给出的评分尺度和含义不相同的问题。
第2个问题，等研究完所有的算法以后再来考虑。

简单，使用归一化，将每个评分值缩放到0-1上，1代表最高，0代表最低。

# 评价值归一化：因为不同的评价方法的返回值和含义不同。这里所有的评价值归一化到0-1,默认越大越好
    def normalizescores(self,scores,smallIsBetter=0):
        vsmall=0.00001 #避免被0整除
        if smallIsBetter:
            minscore=min(scores.values())
            return dict([(u,float(minscore)/max(vsmall,l)) for (u,l) in scores.items()])
        else:
            maxscore=max(scores.values())
            if maxscore==0: maxscore=vsmall
            return dict([(u,float(c)/maxscore) for (u,c) in scores.items()])

下面我们来研究每一种算法。

第1个排名算法：根据单词位置进行评分的函数

我们可以认为对用户输入的多个关键词，在文档中，这些关键词出现的位置越靠前越好。比如我们往往习惯在文章的前面添加一些摘要性、概括性的描述。

    # 根据单词位置进行评分的函数.
    # rows是[(urlid1,wordlocation1_1,wordlocation1_2,wordlocation1_3...),(urlid2,wordlocation2_1,wordlocation2_2,wordlocation2_3...)]
    def locationscore(self,rows):
        locations=dict([(row[0],1000000) for row in rows])
        for row in rows:
            loc=sum(row[1:]) #计算每个链接的单词位置总和，越小说明越靠前
            if loc<locations[row[0]]:  #记录每个链接最小的一种位置组合
	            locations[row[0]]=loc

        return self.normalizescores(locations,smallIsBetter=1)

第2个排名算法：根据单词频度进行评价的函数

我们可以认为对用户输入的多个关键词，在文档中，这些关键词出现的次数越多越好。比如我们在指定主题的文章中会反复提到这个主题。

  # 根据单词频度进行评价的函数
    # rows是[(urlid1,wordlocation1_1,wordlocation1_2,wordlocation1_3...),(urlid2,wordlocation2_1,wordlocation2_2,wordlocation2_3...)]
    def frequencyscore(self,rows):
        counts=dict([(row[0],0) for row in rows])
        for row in rows: 
	        counts[row[0]]+=1   #统计每个链接出现的组合数目。 每个链接只要有一种位置组合就会保存一个元组。所以链接所拥有的组合数，能一定程度上表示单词出现的多少。
        return self.normalizescores(counts)

第3个排名算法：根据单词距离进行评价的函数

我们可以认为对用户输入的多个关键词，在文档中，这些关键词出现的越紧凑越好。这是因为我们更希望所有单词出现在一句话中，而不是不同的关键词出现在不同段落或语句中。

  # 根据单词距离进行评价的函数。
    # rows是[(urlid1,wordlocation1_1,wordlocation1_2,wordlocation1_3...),(urlid2,wordlocation2_1,wordlocation2_2,wordlocation2_3...)]
    def distancescore(self,rows):
        # 如果仅查询了一个单词，则得分都一样
        if len(rows[0])<=2: return dict([(row[0],1.0) for row in rows])

        # 初始化字典，并填入一个很大的值
        mindistance=dict([(row[0],1000000) for row in rows])

        for row in rows:
            dist=sum([abs(row[i]-row[i-1]) for i in range(2,len(row))]) # 计算每种组合中每个单词之间的距离
            if dist<mindistance[row[0]]:  # 计算每个链接所有组合的距离。并为每个链接记录最小的距离
	            mindistance[row[0]]=dist
        return self.normalizescores(mindistance,smallIsBetter=1)

第4个排名算法：利用指向该链接的链接数目进行评价

我们可以认为查询到的相关链接中，如果有较多的其他链接在文档中指向了当前链接，这该链接内容质量比较好。

  # 利用指向该链接的链接数目进行评价（仅计算回指数目）。
    # rows是[(urlid1,wordlocation1_1,wordlocation1_2,wordlocation1_3...),(urlid2,wordlocation2_1,wordlocation2_2,wordlocation2_3...)]
    def inboundlinkscore(self,rows):
        uniqueurls=dict([(row[0],1) for row in rows])
        inboundcount=dict([(u,self.curs.execute('select count(*) from link where toid=%d' % u).fetchall()[0]) for u in uniqueurls])
        return self.normalizescores(inboundcount)

第5个排名算法：pagerank算法

互联网中的网页可以看出是一个有向图，其中网页是结点，如果网页A有链接到网页B，则存在一条有向边A->B，下面是一个简单的示例：

这里写图片描述

这个例子中只有四个网页，如果当前在A网页，那么悠闲的上网者将会各以1/3的概率跳转到B、C、D，这里的3表示A有3条出链，如果一个网页有k条出链，那么跳转任意一个出链上的概率是1/k，同理D到B、C的概率各为1/2，而B到C的概率为0。

在跳转有向图中不考虑指向自己的链接

由于上网者并不一定会点击网络中的链接跳转到新的网址，所以我们需要设置阻尼系数，代表用户点击链接进入新网址的概率。

另外每一个网页本身的重要性也不同，被不同重要性的网址所引用自然也就代表了不同的重要性。比如网址D是一个非常重要的网址，在D中指向了网址B和C，则我们可以认为网址B和网址C也很重要。而网址的重要性又是通过pagerank所表示的，所以这是一个迭代收敛获取每个网址pagerank的过程。

由于每个网址的链接情况是相对固定的，每个网址的pagerank我们可以在用户请求前就计算好，将每个网址的pagerank值保存到数据库中，待用户请求时，直接查询相关网址的pagerank值就可以了。

第一步、提前迭代计算每个网址的pagerank的值。

每个链接的pagerank=每个指向此链接的网页的pagerank/该网页中的链接总数*0.85+0.15，其中0.85表示阻尼因子，表示网页是否点击该网页中的链接。

先初始化每个网页的pagerank的值为1，然后按照上面的方法迭代n次计算每个网页的pagerank。

我们在spyder.py模块的crawler类中添加下面的函数

# （每个链接的pagerank=指向此链接的网页的pagerank/网页中的链接总数*0.85+0.15，其中0.85表示阻尼因子，表示网页是否点击该网页中的链接）
    # pagerank算法，离线迭代计算，形成每个链接的稳定pagerank值。iterations为迭代计算的次数
    def calculatepagerank(self, iterations=20):
        # 清除您当前的pagerank表
        self.curs.execute('drop table if exists pagerank')
        self.curs.execute('create table pagerank(urlid primary key,score)')

        # 初始化每个url，令其pagerank的值为1
        for (urlid,) in self.curs.execute('select rowid from urllist').fetchall():
            self.curs.execute('insert into pagerank(urlid,score) values (%d,1.0)' % urlid)
        self.dbcommit()

        for i in range(iterations):
            print("Iteration %d" % (i))
            for (urlid,) in self.curs.execute('select rowid from urllist').fetchall():
                pr = 0.15
                # 循环遍历指向当前网页的所有其他网页
                for (linker,) in self.curs.execute('select distinct fromid from link where toid=%d' % urlid).fetchall():
                    # 得到链接源对应网页的pagerank值
                    linkingpr = self.curs.execute('select score from pagerank where urlid=%d' % linker).fetchall()[0][0]
                    # 根据链接源求总的链接数
                    linkingcount = self.curs.execute('select count(*) from link where fromid=%d' % linker).fetchall()[0][0]
                    pr += 0.85 * (linkingpr / linkingcount)
                self.curs.execute('update pagerank set score=%f where urlid=%d' % (pr, urlid))
            self.dbcommit()

第二步、查询网址的pagerank的值为相关网址进行排名

调用函数生成网址pagerank数据库以后，就可以通过pagerank对查询到的网址进行排名了。

# 根据pagerank值进行评价的函数。（利用外部回值链接进行评价）。
# rows是[(urlid1,wordlocation1_1,wordlocation1_2,wordlocation1_3...),(urlid2,wordlocation2_1,wordlocation2_2,wordlocation2_3...)]
    def pagerankscore(self,rows):
        pageranks=dict([(row[0],self.curs.execute('select score from pagerank where urlid=%d' % row[0]).fetchall()[0][0]) for row in rows])
        maxrank=max(pageranks.values())   #求最大的pagerank值
        for urlid in pageranks:
            pageranks[urlid] /= maxrank   #归一化
        return pageranks   #返回归一化的url的pagerank

第6个排名算法：利用链接文本进行评价的函数

有时，相比于被链接的网址自身所提供的信息而言，我们从指向该网页的链接中所得到的信息会更有价值。因为针对其所指向的网页，网站的开发者会倾向于提供一些解释其内容的简短描述。

# 利用链接文本进行评价的函数。
    # rows是[(urlid1,wordlocation1_1,wordlocation1_2,wordlocation1_3...),(urlid2,wordlocation2_1,wordlocation2_2,wordlocation2_3...)]
    def linktextscore(self,rows,wordids):
        linkscores=dict([(row[0],0) for row in rows])
        for wordid in wordids:
            cur=self.curs.execute('select link.fromid,link.toid from linkwords,link where wordid=%d and linkwords.linkid=link.rowid' % wordid)
            for (fromid,toid) in cur.fetchall():
                if toid in linkscores:
                    pr=self.curs.execute('select score from pagerank where urlid=%d' % fromid).fetchall()[0][0]
                    linkscores[toid]+=pr
        maxscore=max(linkscores.values())   #求最大的pagerank值
        for urlid in linkscores:
            linkscores[urlid] /= maxscore   #归一化
        return linkscores

第五部分、综合利用各种评分算法，将查询到的网址进行评分排名

# 对查询到的链接进行排名。参数：rows，wordids查询字符串id列表
    def getscoredlist(self,rows,wordids):
        totalscores=dict([(row[0],0) for row in rows])
        # 对链接进行评价的函数。（权重和评价值），使用了多种评价函数
        weights=[(1.0,self.locationscore(rows)),   #根据关键词出现的位置获取权重
                 (1.0,self.frequencyscore(rows)),  #根据关键词出现的频率获取权重
                 (1.0,self.pagerankscore(rows)),   #根据pagerank获取权重
                 (1.0,self.linktextscore(rows,wordids)), #根据链接描述获取权重
                 #(5.0,self.nnscore(rows,wordids))   #根据神经网络获取权重
                 ] 
        for (weight,scores) in weights:
            for urlid in totalscores:
                totalscores[urlid]+=weight*scores[urlid]

        return totalscores  #返回每个链接的评价值

现在我们已经能根据用户输入字符串，分词成多个关键词，查询相关网址，对网址进行评分。那只要按照分数进行排序，给出排名靠前的搜索引擎就完成了。


    #搜索函数：将上面的搜索、评价、排名合并在一起。querystr为用户输入字符串
    def query(self,querystr):
        rows,wordids=self.getmatchrows(querystr)  #rows是[(urlid1,wordlocation1_1,wordlocation1_2,wordlocation1_3...),(urlid2,wordlocation2_1,wordlocation2_2,wordlocation2_3...)]
        # print(rows)
        rows = list(set(rows))   #去重，位置组合重复计算没有意义，虽然说，所有的都重复了不影响效果。但是计算量增大了。
        if rows==None or len(rows)==0:
            print('无法查询到,请使用空格分隔查询关键词')
            return

        scores=self.getscoredlist(rows,wordids)
        rankedscores=[(score,url) for (url,score) in scores.items()]
        rankedscores.sort()
        rankedscores.reverse()
        for (score,urlid) in rankedscores[0:10]:
            print('%f\t%d\t%s' % (score,urlid,self.geturlname(urlid)))
        return wordids,[r[1] for r in rankedscores[0:10]]


#根据urlid查询url
    def geturlname(self,id):
        return self.curs.execute("select url from urllist where rowid=%d" % id).fetchall()[0][0]

测试

mynet=nn.searchnet('csdn.db')
if __name__ == '__main__':
    mysearcher= searcher('csdn.db')
    searchkey = input("搜索关键词>")
    wordids,urlids=mysearcher.query(searchkey)
    print(wordids,urlids)

我们输入“腾讯股票爬虫”

返回的结果为

3.421882	56	http://blog.csdn.net/luanpeng825485697/article/details/78451057
3.026871	58	http://blog.csdn.net/luanpeng825485697/article/details/78442062
1.472765	1	http://blog.csdn.net/luanpeng825485697
1.386459	2	http://blog.csdn.net/luanpeng825485697?viewmode=contents
1.220016	51	http://blog.csdn.net/luanpeng825485697/article/category/7225092
1.220016	48	http://blog.csdn.net/luanpeng825485697/article/category/7083783
1.167393	43	http://blog.csdn.net/luanpeng825485697/article/category/7060769
1.167161	50	http://blog.csdn.net/luanpeng825485697/article/category/7190343
1.152465	47	http://blog.csdn.net/luanpeng825485697/article/category/7145481
1.105118	7	http://blog.csdn.net/luanpeng825485697/article/details/78154417

搜索引擎设计成功。

通过神经网络学习用户点击行为

在上面的搜索引擎设计中我们实现了6种排名算法。下面我们要再介绍一种排名算法。由于这种算法的设计本身并不对文档内容进行解析，而只是对用户行为进行解析，内容较多，知识较难，所以我们单独来讲。

下面要讲的是通过神经网络来实现搜索引擎。没有学习过深度学习的同学可以先不用看这一部分。

神经网络设计搜索引擎，简单的说就是会记忆用户的选择行为。比如搜索引擎根据已有的算法为用户的搜索返回了一些网址。但是这些网址究竟好不好呢。网站并没有这样一个反馈机制。而搜索引擎就是为这样一个反馈机制存在的。当用户选择了一个推荐的网址，神经网络就会为这个网址添加一定的权重，下次搜索相同的内容，更加情愿优先推荐权重大的网址。

将下面的代码存储成nn.py

# 神经网络学习用户点击行为，实现搜索排名
from math import tanh
import sqlite3

# 指定任何输出值的斜率
def dtanh(y):
    return 1.0-y*y

class searchnet:
    def __init__(self,dbname):
        self.con=sqlite3.connect(dbname)  #链接数据库
        self.curs = self.con.cursor()
        try:
            self.maketables()
        except:
            pass

    def __del__(self):
        self.curs.close()
        self.con.close()

    def dbcommit(self):  # 保存数据库
        self.con.commit()

    def maketables(self):
        self.curs.execute('create table hiddennode(create_key)')              #创建神经元节点
        self.curs.execute('create table wordhidden(fromid,toid,strength)')   #输入节点（单词）到神经元的权重，默认-0.2
        self.curs.execute('create table hiddenurl(fromid,toid,strength)')    #神经元到输出节点（链接）的权重，默认为0
        self.dbcommit()

        # 获取节点间权重。layer为0表示输入到神经元之间，为1表示神经元到输出之间
    def getstrength(self,fromid,toid,layer):
        if layer==0: table='wordhidden'
        else: table='hiddenurl'
        res=self.curs.execute('select strength from %s where fromid=%d and toid=%d' % (table,fromid,toid)).fetchall()
        if res==None or len(res)==0:
            if layer==0: return -0.2  # 输入到神经元之间默认为-0.2
            if layer==1: return 0   # 神经元到输出节点间默认为0
        return res[0][0]

    # 设置数据库中节点间权重。layer为0表示输入到神经元之间，为1表示神经元到输出之间
    def setstrength(self,fromid,toid,layer,strength):
        if layer==0: table='wordhidden'
        else: table='hiddenurl'
        res=self.curs.execute('select rowid from %s where fromid=%d and toid=%d' % (table,fromid,toid)).fetchall()
        if res==None or len(res)==0:
            self.curs.execute('insert into %s (fromid,toid,strength) values (%d,%d,%f)' % (table,fromid,toid,strength))
        else:
            rowid=res[0][0]
            self.curs.execute('update %s set strength=%f where rowid=%d' % (table,strength,rowid))

     # 根据已知正确的输入输出结果，创建神经元,建立输入与神经元间里连接，神经元与输出间连接。每一种输入组合都创建一个神经节点
    def generatehiddennode(self,wordids,urlids):
        # if len(wordids)>3: return None   #对于有3个输出单词的我们不做处理，太复杂
        # 检测我们是否已经为这组输入建好了一个节点
        sorted_words=[str(id) for id in wordids]
        sorted_words.sort()
        createkey='_'.join(sorted_words)
        res=self.curs.execute("select rowid from hiddennode where create_key='%s'" % createkey).fetchall()

        # 如果没有，就创建
        if res==None or len(res)==0:
            cur=self.curs.execute("insert into hiddennode (create_key) values ('%s')" % createkey)
            hiddenid=cur.lastrowid
            # 设置默认权重
            for wordid in wordids:
                self.setstrength(wordid,hiddenid,0,1.0/len(wordids))   #设置输入节点到神经元间的权重为1/输入数量
            for urlid in urlids:
                self.setstrength(hiddenid,urlid,1,0.1)   #设置神经元到输出节点间权重为0.1
            self.dbcommit()

    # 根据输入关键词id和相关的链接的id，获取数据库中神经元节点的id。（相关的链接也就是初步查询到的链接，只要链接的网页中出现过关键词就会被认为相关）
    def getallhiddenids(self,wordids,urlids):
        hiddenids=[]
        for wordid in wordids:
            cur=self.curs.execute('select toid from wordhidden where fromid=%d' % wordid).fetchall()
            for row in cur:
                hiddenids.append(row[0])
        for urlid in urlids:
            cur=self.curs.execute('select fromid from hiddenurl where toid=%d' % urlid).fetchall()
            for row in cur:
                hiddenids.append(row[0])
        return hiddenids  # 返回神经元节点

    # 构建一个神经网络
    def setupnetwork(self,wordids,urlids):
        # 值列表：输入：神经元、输出
        self.wordids=wordids
        self.hiddenids=self.getallhiddenids(wordids,urlids)
        self.urlids=urlids

        # 构建输入节点、神经元、输出节点。就是前面的输入、神经元、输出。这里用了一个更加普遍的名称
        self.ai = [1.0]*len(self.wordids)
        self.ah = [1.0]*len(self.hiddenids)
        self.ao = [1.0]*len(self.urlids)

        # 建立权重矩阵（线性组合系数矩阵）：输入-神经元，  神经元-输出
        self.wi = [[self.getstrength(wordid,hiddenid,0)
                    for hiddenid in self.hiddenids]
                    for wordid in self.wordids]
        self.wo = [[self.getstrength(hiddenid,urlid,1)
                    for urlid in self.urlids]
                    for hiddenid in self.hiddenids]

    # 前馈算法：一列输入，进入神经网络，返回所有输出结果的活跃程度。越活跃也好。因为神经网络每向下传播一层就会衰弱一层。衰弱函数使用tanh这种0时陡峭，无限大或无限小时平稳的函数
    def feedforward(self):
        # 查询的单词是仅有的输入
        for i in range(len(self.wordids)):
            self.ai[i] = 1.0   #输入节点的活跃程度就设为1

        # 根据输入节点活跃程度，获取神经元节点的活跃程度
        for j in range(len(self.hiddenids)):
            sum = 0.0
            for i in range(len(self.wordids)):
                sum = sum + self.ai[i] * self.wi[i][j]  # 线性组合
            self.ah[j] = tanh(sum)   #使用tanh表示神经元对输入的反应强度。（因为tanh是一个在0附近震荡强烈，在远离0时趋于稳定的函数）

        # 根据神经元节点活跃程度，获取输出节点的活跃程度
        for k in range(len(self.urlids)):
            sum = 0.0
            for j in range(len(self.hiddenids)):
                sum = sum + self.ah[j] * self.wo[j][k]  # 线性组合
            self.ao[k] = tanh(sum)

        return self.ao[:]

# =============================以上是公共实例函数=======================================

#=======================getresult是应用神经网络进行搜索的函数==============================

    #  针对一组单词和url给出输出
    def getresult(self,wordids,urlids):
        self.setupnetwork(wordids,urlids)
        return self.feedforward()




# ============================下面是使用反向传播法进行神经网络训练============================


 #前馈训练法：依据当前权重预测输出，计算误差，更正权重。用户每选择一次，进行一次训练

  #用户每选择一次链接，就调整一次权重。targets表示正确的输出结果。即用户选择的链接
    def backPropagate(self, targets, N=0.5):
        # 计算输出层误差
        output_deltas = [0.0] * len(self.urlids)
        for k in range(len(self.urlids)):
            error = targets[k]-self.ao[k]    #计算正确输出和预测输出之间的误差
            output_deltas[k] = dtanh(self.ao[k]) * error   #确定总输入需要如何改变

        # 计算神经元误差：
        hidden_deltas = [0.0] * len(self.hiddenids)
        for j in range(len(self.hiddenids)):
            error = 0.0
            for k in range(len(self.urlids)):
                error = error + output_deltas[k]*self.wo[j][k]  #将每个神经元-输出间的权重值乘以输出节点的改变量，再累加求和，从而改变节点的输出结果
            hidden_deltas[j] = dtanh(self.ah[j]) * error  #确定节点的总输入所需的该变量

        # 更新神经元-输出间权重
        for j in range(len(self.hiddenids)):
            for k in range(len(self.urlids)):
                change = output_deltas[k]*self.ah[j]
                self.wo[j][k] = self.wo[j][k] + N*change

        # 更新输入-神经元间权重
        for i in range(len(self.wordids)):
            for j in range(len(self.hiddenids)):
                change = hidden_deltas[j]*self.ai[i]
                self.wi[i][j] = self.wi[i][j] + N*change


    # 更新数据库
    def updatedatabase(self):
        # 将值写入数据库
        for i in range(len(self.wordids)):
            for j in range(len(self.hiddenids)):
                self.setstrength(self.wordids[i],self. hiddenids[j],0,self.wi[i][j])
        for j in range(len(self.hiddenids)):
            for k in range(len(self.urlids)):
                self.setstrength(self.hiddenids[j],self.urlids[k],1,self.wo[j][k])
        self.dbcommit()


   # 对神经网络进行一次训练。wordids为查询关键词id，urlids为查找到相关的url的id，selectedurl为选中的url的id
    def trainquery(self,wordids,urlids,selectedurlid):
        # 第一次运行时，在数据库中创建表。以默认权重赋值
        self.generatehiddennode(wordids,urlids)
        # 创建神经网络类的属性。（输入、神经元、输出、两个权重）
        self.setupnetwork(wordids,urlids)
        self.feedforward()   #执行前馈算法，根据输入获取输出
        targets=[0.0]*len(urlids)
        targets[urlids.index(selectedurlid)]=1.0  #获取用户选择的正确链接
        error = self.backPropagate(targets)  #执行反向传播法修正网络
        self.updatedatabase()   #更新数据库


if __name__ == '__main__':
    con = sqlite3.connect('csdn.db')
    curs = con.cursor()
    command = 'select * from hiddennode'  # "select fromid,toid from link"
    cur = curs.execute(command)
    res = cur.fetchall()
    if res != None and len(res) > 0:
        for row in res:
            print(row)

根据长时间的用户行为，为新搜索相关的网址进行排名

# 根据神经网络（用户点击行为学习）进行评价的函数。神经网络在nn.py中实现。
    # rows是[(urlid1,wordlocation1_1,wordlocation1_2,wordlocation1_3...),(urlid2,wordlocation2_1,wordlocation2_2,wordlocation2_3...)]
    def nnscore(self,rows,wordids):
        # 获得一个由唯一的url id构成的有序列表
        urlids=[urlid for urlid in dict([(row[0],1) for row in rows])]
        nnres=mynet.getresult(wordids,urlids)
        scores=dict([(urlids[i],nnres[i]) for i in range(len(urlids))])
        return self.normalizescores(scores)

有了上面的排名算法，就可以添加到之前的综合评分函数中去了。

腾讯AI架构师

关注

27
点赞
踩
150

收藏

觉得还不错? 一键收藏
打赏
8
评论
python数据挖掘案例系列教程——python实现搜索引擎

全栈工程师开发手册（作者：栾鹏） python数据挖掘系列教程今天我们使用python实现一个网站搜索引擎。主要包含两个部分。网站数据库的生成、搜索引擎。其中搜索引擎部分我们使用单词频度算法、单词距离算法、外部回值算法、链接文本算法、pagerank算法和神经网络学习等6种算法来实现搜索排名。我们这里将http://blog.csdn.net/luanpeng82548569
复制链接

扫一扫