- Host OS: Ubuntu 13.04
- Python: 2.7.4
- Django: 1.5.4
- Scrapy: 0.18.2
- ElasticSearch: 0.90.5
Original work. Please credit the source when reposting: http://blog.yanming8.cn/archives/138
With some spare time on my hands, I read up on the basics of search engines. After collecting material, I learned the core subsystems one needs: a crawling subsystem, an indexing subsystem, and a web request/response subsystem. After working through the documentation of the relevant open-source frameworks, I put together an integrated project, which has been pushed to GitHub.
First, the crawler, written on top of the open-source Scrapy framework. It crawls the campus network (mainly because that traffic is free of data charges):
```python
import re
import urllib

from scrapy.utils.url import urljoin_rfc
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.exceptions import DropItem

from mymodules.items import Website


class Xidian_Spider(BaseSpider):
    name = 'xidian_spider'
    start_urls = ['http://www.xidian.edu.cn/']

    def __init__(self):
        BaseSpider.__init__(self)
        self.allowed_domains = ['xidian.edu.cn']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        # Every href on the page is a candidate link to follow.
        refer_websites = hxs.select('//@href').extract()

        # Build the item for this page: URL, title and visible body text.
        item = Website()
        item['url'] = response.url
        item['title'] = hxs.select('/html/head/title/text()').extract()[0]
        text_nodes = hxs.select('/html/body//*/text()').extract()
        item['content'] = ''.join(text_nodes)
        yield item

        for weburl in refer_websites:
            utf8_url = weburl.encode('utf-8')

            # Skip static resources (the trailing x? also catches
            # .docx/.pptx/.xlsx) and non-crawlable schemes.
            postfix = re.compile(r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
            prefix = re.compile(r'^((javascript:)|(openapi)).+')
            if postfix.match(utf8_url):
                continue
            if prefix.match(utf8_url):
                continue

            # Make relative links absolute and collapse '..' path segments.
            if not utf8_url.startswith('http://'):
                weburl = 'http://' + self.gethostname(response.url) + '/' + weburl
            weburl = re.sub(r'/\.\./\.\./', r'/', weburl)
            weburl = re.sub(r'/\.\./', r'/', weburl)

            yield Request(weburl, callback=self.parse)

    def gethostname(self, res_url):
        # Split 'http://host/path' and return just the host part.
        proto, rest = urllib.splittype(res_url)
        host, rest = urllib.splithost(rest)
        return host
```
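To see what the two `re.sub` calls in `parse()` actually do, here is a quick standalone demonstration; the sample URL is made up for illustration:

```python
import re

# A relative link joined naively onto the host can contain '..' segments.
weburl = 'http://www.xidian.edu.cn/news/../../index.html'
weburl = re.sub(r'/\.\./\.\./', r'/', weburl)   # collapse a '/../../' run
weburl = re.sub(r'/\.\./', r'/', weburl)        # collapse any remaining '/../'
print weburl   # http://www.xidian.edu.cn/news/index.html
```

Note this simply deletes the `..` segments instead of resolving them against the parent directory; the standards-compliant alternative would be `urlparse.urljoin(response.url, weburl)`.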
Each Item produced by the crawl is handed to a pipeline for processing.

The pipeline here deduplicates URLs. Since the set of visited URLs can't simply be kept in memory, it uses a Bloom filter, via the open-source pybloomfilter package installed straight from PyPI (worth studying in more depth when I find the time):
```python
from pybloomfilter import BloomFilter
from scrapy.exceptions import DropItem

from mymodules.searchindex import SearchIndex  # module path assumed


class DuplicatesPipeline(object):

    def __init__(self):
        # Bloom filter sized for 10M URLs at a 1% false-positive rate,
        # persisted in the file 'filter.bloom'.
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()

    def process_item(self, item, spider):
        print '************%d pages visited!*****************' % len(self.bf)
        # add() returns True when the URL was already in the filter.
        if self.bf.add(item['url']):
            raise DropItem("Duplicate item found: %s" % item)
        self.save_to_file(item['url'], item['title'])
        self.si.AddIndex(item)
        return item

    def save_to_file(self, url, utitle):
        # Record each newly visited page as 'url<TAB>title'.
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')
```
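A minimal sketch of the pybloomfilter behaviour the pipeline depends on, using a throwaway filter file under /tmp:

```python
from pybloomfilter import BloomFilter

bf = BloomFilter(1000, 0.01, '/tmp/demo.bloom')
print bf.add('http://www.xidian.edu.cn/')   # False: first time seen
print bf.add('http://www.xidian.edu.cn/')   # True: duplicate, would be dropped
print len(bf)                               # 1 element stored
```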
The SearchIndex used by this pipeline is the class that builds the ElasticSearch index. It is defined as follows:
```python
from pyes import ES

from mymodules.items import Website

INDEX_NAME = 'xidian_spider'


class SearchIndex(object):

    def __init__(self):
        self.conn = ES('127.0.0.1:9200', timeout=3.5)

        # Start from a clean index on every run.
        try:
            self.conn.delete_index(INDEX_NAME)
        except Exception:
            pass
        self.conn.create_index(INDEX_NAME)

        # Analyze title and content with the ik Chinese analyzer, and keep
        # term vectors with positions/offsets so highlighting works.
        mapping = {u'content': {'boost': 1.0,
                                'index': 'analyzed',
                                'store': 'yes',
                                'type': u'string',
                                "indexAnalyzer": "ik",
                                "searchAnalyzer": "ik",
                                "term_vector": "with_positions_offsets"},
                   u'title': {'boost': 1.0,
                              'index': 'analyzed',
                              'store': 'yes',
                              'type': u'string',
                              "indexAnalyzer": "ik",
                              "searchAnalyzer": "ik",
                              "term_vector": "with_positions_offsets"},
                   u'url': {'boost': 1.0,
                            'store': 'yes',
                            'type': u'string',
                            "term_vector": "with_positions_offsets"}}
        self.conn.put_mapping("searchEngine-type", {'properties': mapping}, [INDEX_NAME])

    def AddIndex(self, item):
        print 'Adding index for item %s' % item['title'].encode('utf-8')
        self.conn.index({'title': item['title'].encode('utf-8'),
                         'url': item['url'].encode('utf-8'),
                         'content': item['content'].encode('utf-8')},
                        INDEX_NAME, 'searchEngine-type')
        self.conn.default_indices = [INDEX_NAME]
```
Chinese word segmentation is handled by the IK analyzer, which is installed into ElasticSearch as the elasticsearch-analysis-ik plugin.
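To verify the plugin is actually active, you can ask ES to analyze a sample string through the `_analyze` API; a rough check with urllib2 (index name as above, sample text arbitrary, here the URL-encoded form of "西安电子科技大学"):

```python
import urllib2

url = ('http://127.0.0.1:9200/xidian_spider/_analyze?analyzer=ik'
       '&text=%E8%A5%BF%E5%AE%89%E7%94%B5%E5%AD%90%E7%A7%91%E6%8A%80%E5%A4%A7%E5%AD%A6')
print urllib2.urlopen(url).read()   # JSON listing the segmented tokens
```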
The Django view that handles search requests looks like this:
```python
from django.http import HttpResponse
from django.shortcuts import render


def search(request):
    if 'q' in request.GET and request.GET['q']:
        q = request.GET['q']

        # Default to the first page unless one was explicitly requested.
        page = u'1'
        if 'page' in request.GET:
            page = unicode(request.GET['page'])

        results = dosearch(q, page)
        return render(request, 'res_search.html', {'results': results,
                                                   'query': q,
                                                   'nextpage': int(page) + 1})
    else:
        message = 'You submitted an empty form.'
        return HttpResponse(message)
```
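For completeness, the view still has to be routed. A minimal urls.py entry in the Django 1.5 style, with the module path assumed:

```python
# Hypothetical urls.py wiring for the search view; adjust the module
# path 'mymodules.views' to match the actual project layout.
from django.conf.urls import patterns, url

urlpatterns = patterns('',
    url(r'^search/$', 'mymodules.views.search', name='search'),
)
```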
The dosearch function it calls connects to ES and runs the query:
```python
from pyes import ES
from pyes.query import FieldQuery, BoolQuery, Search
from pyes.highlight import HighLighter

PAGE_SIZE = 10


class SearchResult(object):
    # Plain container for one hit; the template reads url/title/content.
    pass


def dosearch(string, upage):
    conn = ES('127.0.0.1:9200', timeout=3.5)

    # Match the query against both title and content, analyzed with ik;
    # a document matching either field is returned.
    fq_title = FieldQuery(analyzer='ik')
    fq_title.add('title', string)
    fq_content = FieldQuery(analyzer='ik')
    fq_content.add('content', string)
    bq = BoolQuery(should=[fq_title, fq_content])

    # Wrap highlighted fragments in [ ] instead of HTML tags.
    h = HighLighter(['['], [']'], fragment_size=100)

    page = int(upage.encode('utf-8'))
    s = Search(bq, highlight=h, start=(page - 1) * PAGE_SIZE, size=PAGE_SIZE)
    s.add_highlight("content")
    s.add_highlight('title')

    results = conn.search(s, indices='xidian_spider',
                          doc_types='searchEngine-type')

    reslist = []
    for r in results:
        # Prefer the highlighted fragments when ES produced them.
        if r._meta.highlight.has_key("title"):
            r['title'] = r._meta.highlight[u"title"][0]
        if r._meta.highlight.has_key('content'):
            r['content'] = r._meta.highlight[u'content'][0]

        res = SearchResult()
        res.url = r['url']
        res.title = r['title']
        res.content = r['content']
        reslist.append(res)
    return reslist
```
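dosearch can be exercised directly from a Django shell once the index has some documents; a hypothetical run (query text and page number are just examples):

```python
# Assumes ES is running on 127.0.0.1:9200 and the spider has populated
# the xidian_spider index.
results = dosearch(u'图书馆', u'1')
for r in results:
    print r.title, r.url   # highlighted title plus the page URL
```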