A search engine has two modules: 1. data collection and 2. information extraction.
Data Collection
Data collection is done by a crawler. The crawler splits the content of every page it visits into words, then stores the word-to-URL relationships in a database (a minimal sketch follows the list below).
- urllib2 can be used to write the crawler
- BeautifulSoup can be used to extract information from web pages
- sqlite can be used to store the extracted information
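To make the pipeline concrete, here is a minimal sketch of the crawl-split-store idea using the three libraries above. It assumes the same three-table schema the full demo below uses (urlList, wordList, wordLocation); getOrCreate and indexPage are illustrative names, not part of the demo.

```python
# A minimal sketch: fetch a page, split its text into words, and
# store the word-to-URL relationships, reusing the demo's schema.
import re
import sqlite3
import urllib2

from bs4 import BeautifulSoup

db = sqlite3.connect('sketch.db')
db.execute('create table if not exists urlList(url)')
db.execute('create table if not exists wordList(word)')
db.execute('create table if not exists wordLocation(urlID, wordID, location)')

def getOrCreate(table, field, value):
    # Return the rowid of value in table, inserting it first if absent.
    row = db.execute('select rowid from %s where %s=?' % (table, field), (value,)).fetchone()
    if row:
        return row[0]
    return db.execute('insert into %s(%s) values (?)' % (table, field), (value,)).lastrowid

def indexPage(url):
    # Fetch the page, strip the markup, and record every word position.
    text = BeautifulSoup(urllib2.urlopen(url).read(), 'lxml').get_text()
    urlID = getOrCreate('urlList', 'url', url)
    for location, word in enumerate(re.split(r'\W+', text.lower())):
        if word:
            wordID = getOrCreate('wordList', 'word', word)
            db.execute('insert into wordLocation values (?,?,?)', (urlID, wordID, location))
    db.commit()
```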
Information Extraction
Querying takes the given keywords and looks up the URLs that match them. A scoring algorithm then computes a rank for each URL, and the highest-ranked URL is listed first. The ranking algorithm weighs factors such as:
- Word frequency
- Word location (words near the top of a page usually reflect its topic)
- Distance between the query words
- Number of times the page is linked to
- PageRank (the likelihood that a user clicking links at random, for any number of clicks, ends up at this page; every page can start from the same initial factor, and after a few iterations the scores converge toward reasonable values; see the sketch after this list)
- Link text.
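The PageRank item above can be made concrete with a short sketch. The damping factor of 0.85 and the uniform initial score of 1.0 are the conventional choices from the original PageRank formulation; the three-page link graph at the end is a made-up example.

```python
# A sketch of the PageRank iteration: every page starts with the same
# score, and repeated updates converge toward the stable ranking.
def pagerank(links, iterations=20, damping=0.85):
    ranks = dict((page, 1.0) for page in links)  # identical initial factors
    for _ in range(iterations):
        newRanks = {}
        for page in links:
            # A page's score is the damped sum of the scores of pages
            # linking to it, each divided by that page's outgoing-link count.
            incoming = sum(ranks[other] / len(targets)
                           for other, targets in links.items()
                           if page in targets)
            newRanks[page] = (1 - damping) + damping * incoming
        ranks = newRanks
    return ranks

# Example: A -> B, A -> C, B -> C, C -> A; C ends up ranked highest.
print(pagerank({'A': ['B', 'C'], 'B': ['C'], 'C': ['A']}))
```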
Demo Code
This search engine is only a simple demo that illustrates the basic principles of a search engine; the hardest problem for engines like Baidu and Google is processing data at massive scale. Note that the demo is written in Python 2 (urllib2 and urlparse became urllib.request and urllib.parse in Python 3).
```python
import urllib2
from bs4 import BeautifulSoup
from urlparse import urljoin
import sqlite3
import re

# Common words that carry no search value and are not indexed.
ignoreWords = set(['the', 'of', 'to', 'and', 'it', 'in'])

class crawler:
    def __init__(self):
        self.db = sqlite3.connect("search.db")

    def __del__(self):
        self.db.close()

    def dbCommit(self):
        self.db.commit()

    def createIndexTable(self):
        # urlList and wordList hold the URLs and words; wordLocation maps
        # each word occurrence to a URL and a position; link and linkWords
        # would record the link graph (left unused in this demo).
        self.db.execute('create table if not exists urlList(url)')
        self.db.execute('create table if not exists wordList(word)')
        self.db.execute('create table if not exists wordLocation(urlID, wordID, location)')
        self.db.execute('create table if not exists link(fromID INTEGER, toID INTEGER)')
        self.db.execute('create table if not exists linkWords(word, link)')

    def getEntryId(self, table, field, value, createnew=True):
        # Return the rowid of value in table, inserting it first if absent.
        # (String interpolation is kept for brevity; real code should use
        # parameterized queries to avoid SQL injection.)
        cur = self.db.execute("select rowid from %s where %s='%s'" % (table, field, value))
        res = cur.fetchone()
        if res is None:
            cur = self.db.execute("insert into %s(%s) values ('%s')" % (table, field, value))
            return cur.lastrowid
        else:
            return res[0]

    def addToIndex(self, url, soup):
        if self.isIndexed(url):
            return
        print 'Indexing %s' % url
        text = self.getTextOnly(soup)
        words = self.separate(text)
        urlid = self.getEntryId('urlList', 'url', url)
        # Record the position of every indexable word on the page.
        for i in range(len(words)):
            word = words[i]
            if word in ignoreWords:
                continue
            wordid = self.getEntryId('wordList', 'word', word)
            self.db.execute('insert into wordLocation values (%d,%d,%d)' % (urlid, wordid, i))

    def getTextOnly(self, soup):
        # Recursively collect the text nodes of the parsed page.
        v = soup.string
        if v is None:
            results = ''
            for t in soup.contents:
                results += self.getTextOnly(t) + '\n'
            return results
        else:
            return v.strip()

    def separate(self, text):
        # Split on runs of non-word characters and lowercase everything.
        splitter = re.compile(r'\W+')
        return [s.lower() for s in splitter.split(text) if s != '']

    def isIndexed(self, url):
        u = self.db.execute("select rowid from urlList where url='%s'" % url).fetchone()
        if u is not None:
            # Only count the URL as indexed if words were stored for it.
            v = self.db.execute('select rowid from wordLocation where urlID=%d' % u[0]).fetchone()
            if v is not None:
                return True
        return False

    def addLinkRef(self, urlFrom, urlTo, linkText):
        # Placeholder: filling link and linkWords is left out of this demo.
        pass

    def crawl(self, pages, depth=3):
        # Breadth-first crawl: index each page, collect its outgoing links,
        # then repeat on the newly found pages up to the given depth.
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read(), 'lxml')
                self.addToIndex(page, soup)
                for link in soup('a'):
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]  # drop the fragment identifier
                        if url[0:4] == 'http' and not self.isIndexed(url):
                            newpages.add(url)
                        linkText = self.getTextOnly(link)
                        self.addLinkRef(page, url, linkText)
                self.dbCommit()
            pages = newpages

class searcher:
    def __init__(self):
        self.db = sqlite3.connect("search.db")

    def __del__(self):
        self.db.close()

    def getMatchRows(self, q):
        # Build a self-join over wordLocation, one alias per query word, so
        # each result row is (urlID, word1location, word2location, ...).
        fieldList = 'w0.urlID'
        tableList = ''
        clauseList = ''
        wordids = []
        words = q.split(' ')
        tableNumber = 0
        for word in words:
            wordRow = self.db.execute("select rowid from wordList where word='%s'" % word).fetchone()
            if wordRow is not None:
                wordId = wordRow[0]
                wordids.append(wordId)
                if tableNumber > 0:
                    tableList += ','
                    clauseList += ' and '
                    clauseList += 'w%d.urlID=w%d.urlID and ' % (tableNumber - 1, tableNumber)
                fieldList += ',w%d.location' % tableNumber
                tableList += 'wordLocation w%d' % tableNumber
                clauseList += 'w%d.wordID=%d' % (tableNumber, wordId)
                tableNumber += 1
        # Example: select w0.urlID,w0.location,w1.location from wordLocation w0,wordLocation w1
        #          where w0.wordID=2 and w0.urlID=w1.urlID and w1.wordID=37
        fullQuery = 'select %s from %s where %s' % (fieldList, tableList, clauseList)
        print fullQuery
        rows = [row for row in self.db.execute(fullQuery)]
        return rows, wordids

    def getScoredList(self, rows, wordids):
        # Weighted sum of scoring functions. Only word frequency is used
        # here; more (weight, scores) pairs can be appended -- see the
        # sketch after this listing.
        totalScores = dict([(row[0], 0) for row in rows])
        weights = [(1.0, self.frequencyScores(rows))]
        for (weight, scores) in weights:
            for url in totalScores:
                totalScores[url] += weight * scores[url]
        return totalScores

    def getUrlName(self, id):
        return self.db.execute('select url from urlList where rowid=%d' % id).fetchone()[0]

    def query(self, q):
        # Print the matching URLs, best score first.
        rows, wordids = self.getMatchRows(q)
        scores = self.getScoredList(rows, wordids)
        rankedScores = sorted([(score, url) for (url, score) in scores.items()], reverse=True)
        for (score, urlid) in rankedScores:
            print '%f %s' % (score, self.getUrlName(urlid))

    def normalizeScores(self, scores, smallIsBetter=0):
        # Map raw scores into the range 0..1 so different factors are comparable.
        vsmall = 0.00001  # avoid division by zero
        if smallIsBetter:
            minScore = min(scores.values())
            return dict([(u, float(minScore) / max(vsmall, l)) for (u, l) in scores.items()])
        else:
            maxScore = max(scores.values())
            if maxScore == 0:
                maxScore = vsmall
            return dict([(u, float(c) / maxScore) for (u, c) in scores.items()])

    def frequencyScores(self, rows):
        # Score a URL by how many matching rows it produced.
        counts = dict([(row[0], 0) for row in rows])
        for row in rows:
            counts[row[0]] += 1
        return self.normalizeScores(counts)

if __name__ == '__main__':
    c = crawler()
    #c.createIndexTable()
    #c.crawl(["http://www.cnblogs.com/huxi/archive/2010/07/04/1771073.html"])
    se = searcher()
    se.query("python xml")  # query() prints the ranked results itself
```
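The demo wires only frequencyScores into getScoredList. The other factors from the list in the Information Extraction section can be added the same way. As a sketch following the approach of Programming Collective Intelligence (the book this demo is based on), here are possible locationScore and distanceScore methods for the searcher class; the weights in the closing comment are illustrative, not tuned values.

```python
# Sketch: two more scoring functions for the searcher class, modeled on
# Programming Collective Intelligence. They consume the rows returned by
# getMatchRows: (urlID, word1location, word2location, ...).

def locationScore(self, rows):
    # Pages where the query words appear nearer the top score higher.
    locations = dict([(row[0], 1000000) for row in rows])
    for row in rows:
        loc = sum(row[1:])  # combined position of the query words
        if loc < locations[row[0]]:
            locations[row[0]] = loc
    return self.normalizeScores(locations, smallIsBetter=1)

def distanceScore(self, rows):
    # Pages where the query words appear close together score higher.
    if len(rows[0]) <= 2:
        return dict([(row[0], 1.0) for row in rows])  # single-word query
    minDistance = dict([(row[0], 1000000) for row in rows])
    for row in rows:
        dist = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
        if dist < minDistance[row[0]]:
            minDistance[row[0]] = dist
    return self.normalizeScores(minDistance, smallIsBetter=1)

# These plug into getScoredList as extra weighted factors, e.g.:
# weights = [(1.0, self.frequencyScores(rows)),
#            (1.5, self.locationScore(rows)),
#            (1.0, self.distanceScore(rows))]
```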
References
- Programming Collective Intelligence (集体智慧编程)