- Host OS: Ubuntu 13.04
- Python: 2.7.4
- Django: 1.5.4
- Scrapy: 0.18.2
- ElasticSearch: 0.90.5
Original work. Please credit the source when reposting: http://blog.yanming8.cn/archives/138
With some spare time on my hands, I read up on the basics of search engines. After collecting material, I learned the core subsystems one needs: a crawling subsystem, an indexing subsystem, and a web request/response subsystem. After working through the documentation of the relevant open-source frameworks, I put together an integrated project, which has been pushed to GitHub.
First, the crawler, written on top of the open-source Scrapy framework. It crawls the campus network (mainly because that traffic is free of data charges):
```python
import re
import urllib

from scrapy.utils.url import urljoin_rfc
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.exceptions import DropItem

from mymodules.items import Website


class Xidian_Spider(BaseSpider):
    name = 'xidian_spider'
    start_urls = ['http://www.xidian.edu.cn/']

    def __init__(self):
        BaseSpider.__init__(self)
        self.allowed_domains = ['xidian.edu.cn']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        # Every href on the page is a candidate link to follow.
        refer_websites = hxs.select('//@href').extract()

        # Build the item for this page: URL, title and visible body text.
        item = Website()
        item['url'] = response.url
        item['title'] = hxs.select('/html/head/title/text()').extract()[0]
        text_nodes = hxs.select('/html/body//*/text()').extract()
        item['content'] = ''.join(text_nodes)
        yield item

        for weburl in refer_websites:
            utf8_url = weburl.encode('utf-8')

            # Skip static resources (the trailing x? also catches
            # .docx/.pptx/.xlsx) and non-crawlable schemes.
            postfix = re.compile(r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
            prefix = re.compile(r'^((javascript:)|(openapi)).+')
            if postfix.match(utf8_url):
                continue
            if prefix.match(utf8_url):
                continue

            # Make relative links absolute and collapse '..' path segments.
            if not utf8_url.startswith('http://'):
                weburl = 'http://' + self.gethostname(response.url) + '/' + weburl
            weburl = re.sub(r'/\.\./\.\./', r'/', weburl)
            weburl = re.sub(r'/\.\./', r'/', weburl)

            yield Request(weburl, callback=self.parse)

    def gethostname(self, res_url):
        # Split 'http://host/path' and return just the host part.
        proto, rest = urllib.splittype(res_url)
        host, rest = urllib.splithost(rest)
        return host
```
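To see what the two `re.sub` calls in `parse()` actually do, here is a quick standalone demonstration; the sample URL is made up for illustration:

```python
import re

# A relative link joined naively onto the host can contain '..' segments.
weburl = 'http://www.xidian.edu.cn/news/../../index.html'
weburl = re.sub(r'/\.\./\.\./', r'/', weburl)   # collapse a '/../../' run
weburl = re.sub(r'/\.\./', r'/', weburl)        # collapse any remaining '/../'
print weburl   # http://www.xidian.edu.cn/news/index.html
```

Note this simply deletes the `..` segments instead of resolving them against the parent directory; the standards-compliant alternative would be `urlparse.urljoin(response.url, weburl)`.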
Each Item produced by the crawl is handed to a pipeline for processing.

The pipeline here deduplicates URLs. Since the set of visited URLs can't simply be kept in memory, it uses a Bloom filter, via the open-source pybloomfilter package installed straight from PyPI (worth studying in more depth when I find the time):
```python
from pybloomfilter import BloomFilter
from scrapy.exceptions import DropItem

from mymodules.searchindex import SearchIndex  # module path assumed


class DuplicatesPipeline(object):

    def __init__(self):
        # Bloom filter sized for 10M URLs at a 1% false-positive rate,
        # persisted in the file 'filter.bloom'.
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()

    def process_item(self, item, spider):
        print '************%d pages visited!*****************' % len(self.bf)
        # add() returns True when the URL was already in the filter.
        if self.bf.add(item['url']):
            raise DropItem("Duplicate item found: %s" % item)
        self.save_to_file(item['url'], item['title'])
        self.si.AddIndex(item)
        return item

    def save_to_file(self, url, utitle):
        # Record each newly visited page as 'url<TAB>title'.
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')
```
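A minimal sketch of the pybloomfilter behaviour the pipeline depends on, using a throwaway filter file under /tmp:

```python
from pybloomfilter import BloomFilter

bf = BloomFilter(1000, 0.01, '/tmp/demo.bloom')
print bf.add('http://www.xidian.edu.cn/')   # False: first time seen
print bf.add('http://www.xidian.edu.cn/')   # True: duplicate, would be dropped
print len(bf)                               # 1 element stored
```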
The SearchIndex used by this pipeline is the class that builds the ElasticSearch index. It is defined as follows:
```python
from pyes import ES

from mymodules.items import Website

INDEX_NAME = 'xidian_spider'


class SearchIndex(object):

    def __init__(self):
        self.conn = ES('127.0.0.1:9200', timeout=3.5)

        # Start from a clean index on every run.
        try:
            self.conn.delete_index(INDEX_NAME)
        except Exception:
            pass
        self.conn.create_index(INDEX_NAME)

        # Analyze title and content with the ik Chinese analyzer, and keep
        # term vectors with positions/offsets so highlighting works.
        mapping = {u'content': {'boost': 1.0,
                                'index': 'analyzed',
                                'store': 'yes',
                                'type': u'string',
                                "indexAnalyzer": "ik",
                                "searchAnalyzer": "ik",
                                "term_vector": "with_positions_offsets"},
                   u'title': {'boost': 1.0,
                              'index': 'analyzed',
                              'store': 'yes',
                              'type': u'string',
                              "indexAnalyzer": "ik",
                              "searchAnalyzer": "ik",
                              "term_vector": "with_positions_offsets"},
                   u'url': {'boost': 1.0,
                            'store': 'yes',
                            'type': u'string',
                            "term_vector": "with_positions_offsets"}}
        self.conn.put_mapping("searchEngine-type", {'properties': mapping}, [INDEX_NAME])

    def AddIndex(self, item):
        print 'Adding index for item %s' % item['title'].encode('utf-8')
        self.conn.index({'title': item['title'].encode('utf-8'),
                         'url': item['url'].encode('utf-8'),
                         'content': item['content'].encode('utf-8')},
                        INDEX_NAME, 'searchEngine-type')
        self.conn.default_indices = [INDEX_NAME]
```
Chinese word segmentation is handled by the IK analyzer, which is installed into ElasticSearch as the elasticsearch-analysis-ik plugin.
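To verify the plugin is actually active, you can ask ES to analyze a sample string through the `_analyze` API; a rough check with urllib2 (index name as above, sample text arbitrary, here the URL-encoded form of "西安电子科技大学"):

```python
import urllib2

url = ('http://127.0.0.1:9200/xidian_spider/_analyze?analyzer=ik'
       '&text=%E8%A5%BF%E5%AE%89%E7%94%B5%E5%AD%90%E7%A7%91%E6%8A%80%E5%A4%A7%E5%AD%A6')
print urllib2.urlopen(url).read()   # JSON listing the segmented tokens
```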
The Django view that handles search requests looks like this:
```python
from django.http import HttpResponse
from django.shortcuts import render


def search(request):
    if 'q' in request.GET and request.GET['q']:
        q = request.GET['q']

        # Default to the first page unless one was explicitly requested.
        page = u'1'
        if 'page' in request.GET:
            page = unicode(request.GET['page'])

        results = dosearch(q, page)
        return render(request, 'res_search.html', {'results': results,
                                                   'query': q,
                                                   'nextpage': int(page) + 1})
    else:
        message = 'You submitted an empty form.'
        return HttpResponse(message)
```
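For completeness, the view still has to be routed. A minimal urls.py entry in the Django 1.5 style, with the module path assumed:

```python
# Hypothetical urls.py wiring for the search view; adjust the module
# path 'mymodules.views' to match the actual project layout.
from django.conf.urls import patterns, url

urlpatterns = patterns('',
    url(r'^search/$', 'mymodules.views.search', name='search'),
)
```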
The dosearch function it calls connects to ES and runs the query:
```python
from pyes import ES
from pyes.query import FieldQuery, BoolQuery, Search
from pyes.highlight import HighLighter

PAGE_SIZE = 10


class SearchResult(object):
    # Plain container for one hit; the template reads url/title/content.
    pass


def dosearch(string, upage):
    conn = ES('127.0.0.1:9200', timeout=3.5)

    # Match the query against both title and content, analyzed with ik;
    # a document matching either field is returned.
    fq_title = FieldQuery(analyzer='ik')
    fq_title.add('title', string)
    fq_content = FieldQuery(analyzer='ik')
    fq_content.add('content', string)
    bq = BoolQuery(should=[fq_title, fq_content])

    # Wrap highlighted fragments in [ ] instead of HTML tags.
    h = HighLighter(['['], [']'], fragment_size=100)

    page = int(upage.encode('utf-8'))
    s = Search(bq, highlight=h, start=(page - 1) * PAGE_SIZE, size=PAGE_SIZE)
    s.add_highlight("content")
    s.add_highlight('title')

    results = conn.search(s, indices='xidian_spider',
                          doc_types='searchEngine-type')

    reslist = []
    for r in results:
        # Prefer the highlighted fragments when ES produced them.
        if r._meta.highlight.has_key("title"):
            r['title'] = r._meta.highlight[u"title"][0]
        if r._meta.highlight.has_key('content'):
            r['content'] = r._meta.highlight[u'content'][0]

        res = SearchResult()
        res.url = r['url']
        res.title = r['title']
        res.content = r['content']
        reslist.append(res)
    return reslist
```
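dosearch can be exercised directly from a Django shell once the index has some documents; a hypothetical run (query text and page number are just examples):

```python
# Assumes ES is running on 127.0.0.1:9200 and the spider has populated
# the xidian_spider index.
results = dosearch(u'图书馆', u'1')
for r in results:
    print r.title, r.url   # highlighted title plus the page URL
```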