A search engine has two modules: 1. data collection and 2. information extraction.
Data Collection
Data collection is done by a crawler. The crawler splits the content of every page it visits into words, then stores the word-to-URL relationships in a database (a minimal sketch follows the list below).
- urllib2 can be used to write the crawler
- BeautifulSoup can be used to extract information from web pages
- sqlite can be used to store the extracted information
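To make the pipeline concrete, here is a minimal sketch of the crawl-split-store idea using the three libraries above. It assumes the same three-table schema the full demo below uses (urlList, wordList, wordLocation); getOrCreate and indexPage are illustrative names, not part of the demo.

```python
# A minimal sketch: fetch a page, split its text into words, and
# store the word-to-URL relationships, reusing the demo's schema.
import re
import sqlite3
import urllib2

from bs4 import BeautifulSoup

db = sqlite3.connect('sketch.db')
db.execute('create table if not exists urlList(url)')
db.execute('create table if not exists wordList(word)')
db.execute('create table if not exists wordLocation(urlID, wordID, location)')

def getOrCreate(table, field, value):
    # Return the rowid of value in table, inserting it first if absent.
    row = db.execute('select rowid from %s where %s=?' % (table, field), (value,)).fetchone()
    if row:
        return row[0]
    return db.execute('insert into %s(%s) values (?)' % (table, field), (value,)).lastrowid

def indexPage(url):
    # Fetch the page, strip the markup, and record every word position.
    text = BeautifulSoup(urllib2.urlopen(url).read(), 'lxml').get_text()
    urlID = getOrCreate('urlList', 'url', url)
    for location, word in enumerate(re.split(r'\W+', text.lower())):
        if word:
            wordID = getOrCreate('wordList', 'word', word)
            db.execute('insert into wordLocation values (?,?,?)', (urlID, wordID, location))
    db.commit()
```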
Information Extraction
Querying takes the given keywords and looks up the URLs that match them. A scoring algorithm then computes a rank for each URL, and the highest-ranked URL is listed first. The ranking algorithm weighs factors such as:
- Word frequency
- Word location (words near the top of a page usually reflect its topic)
- Distance between the query words
- Number of times the page is linked to
- PageRank (the likelihood that a user clicking links at random, for any number of clicks, ends up at this page; every page can start from the same initial factor, and after a few iterations the scores converge toward reasonable values; see the sketch after this list)
- Link text.
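The PageRank item above can be made concrete with a short sketch. The damping factor of 0.85 and the uniform initial score of 1.0 are the conventional choices from the original PageRank formulation; the three-page link graph at the end is a made-up example.

```python
# A sketch of the PageRank iteration: every page starts with the same
# score, and repeated updates converge toward the stable ranking.
def pagerank(links, iterations=20, damping=0.85):
    ranks = dict((page, 1.0) for page in links)  # identical initial factors
    for _ in range(iterations):
        newRanks = {}
        for page in links:
            # A page's score is the damped sum of the scores of pages
            # linking to it, each divided by that page's outgoing-link count.
            incoming = sum(ranks[other] / len(targets)
                           for other, targets in links.items()
                           if page in targets)
            newRanks[page] = (1 - damping) + damping * incoming
        ranks = newRanks
    return ranks

# Example: A -> B, A -> C, B -> C, C -> A; C ends up ranked highest.
print(pagerank({'A': ['B', 'C'], 'B': ['C'], 'C': ['A']}))
```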
Demo Code
This search engine is only a simple demo that illustrates the basic principles of a search engine; the hardest problem for engines like Baidu and Google is processing data at massive scale. Note that the demo is written in Python 2 (urllib2 and urlparse became urllib.request and urllib.parse in Python 3).
```python
import urllib2
from bs4 import BeautifulSoup
from urlparse import urljoin
import sqlite3
import re

# Common words that carry no search value and are not indexed.
ignoreWords = set(['the', 'of', 'to', 'and', 'it', 'in'])

class crawler:
    def __init__(self):
        self.db = sqlite3.connect("search.db")

    def __del__(self):
        self.db.close()

    def dbCommit(self):
        self.db.commit()

    def createIndexTable(self):
        # urlList and wordList hold the URLs and words; wordLocation maps
        # each word occurrence to a URL and a position; link and linkWords
        # would record the link graph (left unused in this demo).
        self.db.execute('create table if not exists urlList(url)')
        self.db.execute('create table if not exists wordList(word)')
        self.db.execute('create table if not exists wordLocation(urlID, wordID, location)')
        self.db.execute('create table if not exists link(fromID INTEGER, toID INTEGER)')
        self.db.execute('create table if not exists linkWords(word, link)')

    def getEntryId(self, table, field, value, createnew=True):
        # Return the rowid of value in table, inserting it first if absent.
        # (String interpolation is kept for brevity; real code should use
        # parameterized queries to avoid SQL injection.)
        cur = self.db.execute("select rowid from %s where %s='%s'" % (table, field, value))
        res = cur.fetchone()
        if res is None:
            cur = self.db.execute("insert into %s(%s) values ('%s')" % (table, field, value))
            return cur.lastrowid
        else:
            return res[0]

    def addToIndex(self, url, soup):
        if self.isIndexed(url):
            return
        print 'Indexing %s' % url
        text = self.getTextOnly(soup)
        words = self.separate(text)
        urlid = self.getEntryId('urlList', 'url', url)
        # Record the position of every indexable word on the page.
        for i in range(len(words)):
            word = words[i]
            if word in ignoreWords:
                continue
            wordid = self.getEntryId('wordList', 'word', word)
            self.db.execute('insert into wordLocation values (%d,%d,%d)' % (urlid, wordid, i))

    def getTextOnly(self, soup):
        # Recursively collect the text nodes of the parsed page.
        v = soup.string
        if v is None:
            results = ''
            for t in soup.contents:
                results += self.getTextOnly(t) + '\n'
            return results
        else:
            return v.strip()

    def separate(self, text):
        # Split on runs of non-word characters and lowercase everything.
        splitter = re.compile(r'\W+')
        return [s.lower() for s in splitter.split(text) if s != '']

    def isIndexed(self, url):
        u = self.db.execute("select rowid from urlList where url='%s'" % url).fetchone()
        if u is not None:
            # Only count the URL as indexed if words were stored for it.
            v = self.db.execute('select rowid from wordLocation where urlID=%d' % u[0]).fetchone()
            if v is not None:
                return True
        return False

    def addLinkRef(self, urlFrom, urlTo, linkText):
        # Placeholder: filling link and linkWords is left out of this demo.
        pass

    def crawl(self, pages, depth=3):
        # Breadth-first crawl: index each page, collect its outgoing links,
        # then repeat on the newly found pages up to the given depth.
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read(), 'lxml')
                self.addToIndex(page, soup)
                for link in soup('a'):
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]  # drop the fragment identifier
                        if url[0:4] == 'http' and not self.isIndexed(url):
                            newpages.add(url)
                        linkText = self.getTextOnly(link)
                        self.addLinkRef(page, url, linkText)
                self.dbCommit()
            pages = newpages

class searcher:
    def __init__(self):
        self.db = sqlite3.connect("search.db")

    def __del__(self):
        self.db.close()

    def getMatchRows(self, q):
        # Build a self-join over wordLocation, one alias per query word, so
        # each result row is (urlID, word1location, word2location, ...).
        fieldList = 'w0.urlID'
        tableList = ''
        clauseList = ''
        wordids = []
        words = q.split(' ')
        tableNumber = 0
        for word in words:
            wordRow = self.db.execute("select rowid from wordList where word='%s'" % word).fetchone()
            if wordRow is not None:
                wordId = wordRow[0]
                wordids.append(wordId)
                if tableNumber > 0:
                    tableList += ','
                    clauseList += ' and '
                    clauseList += 'w%d.urlID=w%d.urlID and ' % (tableNumber - 1, tableNumber)
                fieldList += ',w%d.location' % tableNumber
                tableList += 'wordLocation w%d' % tableNumber
                clauseList += 'w%d.wordID=%d' % (tableNumber, wordId)
                tableNumber += 1
        # Example: select w0.urlID,w0.location,w1.location from wordLocation w0,wordLocation w1
        #          where w0.wordID=2 and w0.urlID=w1.urlID and w1.wordID=37
        fullQuery = 'select %s from %s where %s' % (fieldList, tableList, clauseList)
        print fullQuery
        rows = [row for row in self.db.execute(fullQuery)]
        return rows, wordids

    def getScoredList(self, rows, wordids):
        # Weighted sum of scoring functions. Only word frequency is used
        # here; more (weight, scores) pairs can be appended -- see the
        # sketch after this listing.
        totalScores = dict([(row[0], 0) for row in rows])
        weights = [(1.0, self.frequencyScores(rows))]
        for (weight, scores) in weights:
            for url in totalScores:
                totalScores[url] += weight * scores[url]
        return totalScores

    def getUrlName(self, id):
        return self.db.execute('select url from urlList where rowid=%d' % id).fetchone()[0]

    def query(self, q):
        # Print the matching URLs, best score first.
        rows, wordids = self.getMatchRows(q)
        scores = self.getScoredList(rows, wordids)
        rankedScores = sorted([(score, url) for (url, score) in scores.items()], reverse=True)
        for (score, urlid) in rankedScores:
            print '%f %s' % (score, self.getUrlName(urlid))

    def normalizeScores(self, scores, smallIsBetter=0):
        # Map raw scores into the range 0..1 so different factors are comparable.
        vsmall = 0.00001  # avoid division by zero
        if smallIsBetter:
            minScore = min(scores.values())
            return dict([(u, float(minScore) / max(vsmall, l)) for (u, l) in scores.items()])
        else:
            maxScore = max(scores.values())
            if maxScore == 0:
                maxScore = vsmall
            return dict([(u, float(c) / maxScore) for (u, c) in scores.items()])

    def frequencyScores(self, rows):
        # Score a URL by how many matching rows it produced.
        counts = dict([(row[0], 0) for row in rows])
        for row in rows:
            counts[row[0]] += 1
        return self.normalizeScores(counts)

if __name__ == '__main__':
    c = crawler()
    #c.createIndexTable()
    #c.crawl(["http://www.cnblogs.com/huxi/archive/2010/07/04/1771073.html"])
    se = searcher()
    se.query("python xml")  # query() prints the ranked results itself
```
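The demo wires only frequencyScores into getScoredList. The other factors from the list in the Information Extraction section can be added the same way. As a sketch following the approach of Programming Collective Intelligence (the book this demo is based on), here are possible locationScore and distanceScore methods for the searcher class; the weights in the closing comment are illustrative, not tuned values.

```python
# Sketch: two more scoring functions for the searcher class, modeled on
# Programming Collective Intelligence. They consume the rows returned by
# getMatchRows: (urlID, word1location, word2location, ...).

def locationScore(self, rows):
    # Pages where the query words appear nearer the top score higher.
    locations = dict([(row[0], 1000000) for row in rows])
    for row in rows:
        loc = sum(row[1:])  # combined position of the query words
        if loc < locations[row[0]]:
            locations[row[0]] = loc
    return self.normalizeScores(locations, smallIsBetter=1)

def distanceScore(self, rows):
    # Pages where the query words appear close together score higher.
    if len(rows[0]) <= 2:
        return dict([(row[0], 1.0) for row in rows])  # single-word query
    minDistance = dict([(row[0], 1000000) for row in rows])
    for row in rows:
        dist = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
        if dist < minDistance[row[0]]:
            minDistance[row[0]] = dist
    return self.normalizeScores(minDistance, smallIsBetter=1)

# These plug into getScoredList as extra weighted factors, e.g.:
# weights = [(1.0, self.frequencyScores(rows)),
#            (1.5, self.locationScore(rows)),
#            (1.0, self.distanceScore(rows))]
```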
References
- Programming Collective Intelligence (集体智慧编程)