from BeautifulSoup import *
from urlparse import urljoin
# Stop words the keyword-based index skips (conjunctions, articles, etc.).
# NOTE(review): the name is misspelled ("ignaorewords"); addtoindex() reads
# this exact spelling, so renaming requires changing both places together.
ignaorewords=set(['the','of','to','and','a','in','is','it'])
# Our search engine is keyword-based, so conjunctions and articles are ignored.
# The code below is the crawler: it stores each page's text in our sqlite
# database. You only need to know what these functions are for.
from sqlite3 import dbapi2 as sqlite
import urllib2
class crawler:
def __init__(self,dbname):
self.con=sqlite.connect(dbname)
#連接並建立數據庫, dbname 隨意, 'xxx.db'就可以
def __del__(self):
self.con.close()
def dbcommit(self):
self.con.commit()
def getentryid(self,table,field,value,createnew=True):
cur=self.con.execute(
"select rowid from %s where %s='%s'" %(table,field,value))
res=cur.fetchone()
if res==None:
cur=self.con.execute(
"insert into %s (%s) values ('%s')" % (table,field,value))
return cur.lastrowid
else:
return res[0]
def addtoindex(self,url,soup):
if self.isindexed(url): return
print 'Indexing',url
#Get words
text=self.gettextonly(soup)
words=self.separatewords(text)
#Get URL id
urlid=self.getentryid('urllist','url',url)
# Link word to url
for i in range(len(words)):
word=words[i]
if word in ignaorewords: continue
wordid=self.getentryid('wordlist','word',word)
self.con.execute("insert into wordlocation(urlid,wordid,location) \
values(%d,%d,%d)" % (urlid,wordid,i))
def gettextonly(self,soup):
v=soup.string
if v==None: