Building Your Own Search Engine

Recently a senior classmate assigned me a learning task: build your own search engine. I got burned by code found online along the way, so I am sharing my own results here.

What I built is a static, site-scoped search engine. Starting from www.sina.com.cn as the seed URL, a breadth-first crawl collects 500 pages; once they are saved, you can build a search engine on top of those 500 pages.

#! /usr/bin/env python
# -*- coding: utf-8 -*-
from spdUtility import PriorityQueue,Parser
import chardet
import urllib2
import sys
import os
import inspect
import time
g_url = 'http://www.sina.com.cn'
g_key = 'www.sina'
"""
def line():
    try:
        raise Exception
    except:
        return sys.exc_info()[2].tb_frame.f_back.f_lineno"""
def updatePriQueue(priQueue, url):
    extraPrior = url.endswith('.html') and 2 or 0    # favour plain .html pages
    extraMyBlog = g_key in url and 5 or 0            # favour urls containing the key string
    item = priQueue.getitem(url)    # look up any existing entry for this url
    if item:
        newitem = (item[0]+1+extraPrior+extraMyBlog, item[1])
        priQueue.remove(item)
        priQueue.push(newitem)
    else :
        priQueue.push( (1+extraPrior+extraMyBlog,url) )
def getmainurl(url):    # get the site root of the url, used as a prefix for relative links (cut at the first '/' after 'http://')
    ix = url.find('/',len('http://') )
    if ix > 0 :
        return url[:ix]
    else :
        return url
def analyseHtml(url, html, priQueue, downlist):
    p = Parser()
    try :
        p.feed(html)
        p.close()
    except:
        return
    mainurl = getmainurl(url)
    print mainurl
    for (k, v) in p.anchors.items():
        for u in v :
            if not u.startswith('http://'):
                u = mainurl + u
            if not downlist.count(u):
                updatePriQueue( priQueue, u)
def downloadUrl(id, url, priQueue, downlist,downFolder):
    downFileName = downFolder+'/%d.html' % (id,)
    print 'downloading', url, 'as', downFileName, time.ctime(),
    try:
        fp = urllib2.urlopen(url)
    except:
        print '[ failed ]'
        return False
    else :
        print '[ success ]'
        downlist.push( url )
        op = open(downFileName, "wb")
        html = fp.read()
        op.write( html )
        op.close()
        fp.close()
        analyseHtml(url, html, priQueue, downlist)
        return True
def spider(beginurl, pages, downFolder):
    priQueue = PriorityQueue()
    downlist = PriorityQueue()
    priQueue.push( (1,beginurl) )
    i = 0
    while not priQueue.empty() and i < pages :
        k, url = priQueue.pop()
        if downloadUrl(i+1, url, priQueue , downlist, downFolder):
            i += 1
    print '\nDownloaded', i, 'pages in total.'
def main():
    beginurl = g_url
    pages = 500
    downloadFolder = './spiderDown'
    if not os.path.isdir(downloadFolder):
        os.mkdir(downloadFolder)
    spider( beginurl, pages, downloadFolder)
if __name__ == '__main__':
    main()
This is the crawler. Most of it comes from the web, but as found it did not work very well; I have modified it and improved its performance a little.

Below are the classes it uses (the spdUtility module imported above):

#! /usr/bin/env python
# -*- coding: utf-8 -*-
import bisect
import string
import htmllib
import formatter
class PriorityQueue(list):
    def __init__(self):
        list.__init__(self)
        self.map = {}
    def push(self, item):
        if self.count(item) == 0:
            bisect.insort(self, item)
            self.map[ item[1] ] = item
    def pop(self):
        r = list.pop(self)
        del self.map[ r[1] ]
        return r
    def getitem(self,url):
        if self.map.has_key( url ):
            return self.map[url]
        else :
            return None
    def empty(self):
        return len(self) == 0
    def remove(self,item):
        list.remove(self, item)
        del self.map[ item[1] ]
    def count(self,item):
        if len(self) == 0 :
            return 0
        # binary search over the sorted list
        left = 0
        right = len(self)-1
        mid = -1
        while left <= right:
            mid = (left+right)/2
            if self[mid] < item :
                left = mid + 1
            elif self[mid] > item :
                right = mid -1
            else:
                break
        return self[mid] == item and 1 or 0
class Parser(htmllib.HTMLParser):
    # HTML parser: collects anchor text -> list of hrefs into self.anchors
    def __init__(self, verbose=0):
        self.anchors = {}
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)
    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        self.anchor = href
    def anchor_end(self):
        text = string.strip(self.save_end())
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]
def main(): #just for test
    pq = PriorityQueue()
    # add items out of order
    pq.push( (1,'http://www.baidu.com') )
    pq.push( (2,'http://www.sina.com') )
    pq.push( (3,'http://www.google.com') )
    pq.push( (1,'http://www.163.com') )
    item = pq.getitem('http://www.sina.com')
    print item
    print pq.count(item)
    pq.remove( item )
    print pq.count(item)
    # print queue contents
    while not pq.empty():
        print pq.pop()
if __name__ == '__main__':
    main()
The next job is to strip the many tags out of the saved HTML. I again referred to code from the web, but once you actually run it you find that encoding problems appear.

#! /usr/bin/env python
# -*- coding: utf-8 -*-
import chardet
import re
import sys
import os
def PreProcess():    # strip HTML markup out of g_HTML
    global g_HTML
    _doctype = re.compile(r'<!DOCTYPE.*?>', re.I|re.S)
    _comment = re.compile(r'<!--.*?-->', re.S)
    _javascript = re.compile(r'<script.*?>.*?<\/script>', re.I|re.S)
    _css = re.compile(r'<style.*?>.*?<\/style>', re.I|re.S)
    _dd = re.compile(r'<dd>.*?</dd>', re.I|re.S)
    _other_tag = re.compile(r'<.*?>', re.S|re.I)
    _special_char = re.compile(r'&.{1,5};|&#.{1,5};')
    g_HTML = _doctype.sub(' ', g_HTML)
    g_HTML = _comment.sub(' ', g_HTML)
    g_HTML = _javascript.sub(' ', g_HTML)
    g_HTML = _css.sub(' ', g_HTML)
    g_HTML = _dd.sub(' ', g_HTML)
    g_HTML = _other_tag.sub(' ', g_HTML)
    g_HTML = _special_char.sub(' ', g_HTML)
def GetContent():
    global g_HTMLBlock
    nMaxSize = len(g_HTMLBlock)
    nBegin = 0
    nEnd = nMaxSize
    for i in range(0, nMaxSize):
        if g_HTMLBlock[i] > 0 and i+3 < nMaxSize and g_HTMLBlock[i+1] > 0 and g_HTMLBlock[i+2] > 0 and g_HTMLBlock[i+3] > 0:    # first run of non-empty blocks marks the start of the body text
            nBegin = i
            break
    return ' '.join(g_HTMLLine[nBegin:nEnd+1])
if __name__ == '__main__':
    rootdir = "e:/my/newspider/spiderDown"
    for parent, dirnames, filenames in os.walk(rootdir):    # os.walk yields the parent directory, its subdirectory names and its file names
        for filename in filenames:
            path = os.path.join(parent, filename)    # filenames from os.walk carry no directory prefix
            f1 = open(path, 'rb')
            f = open(path, 'rb')
            fencoding=chardet.detect(f1.read())
            print fencoding
            """f = file('spiderDown/79.html', 'r')
            f1=file('spiderDown/79.html','r')
            fencoding=chardet.detect(f1.read())
            print fencoding"""
            if fencoding['encoding'] == 'utf-8' or fencoding['encoding'] == 'UTF-8':
                global g_HTML
                global g_HTMLLine
                global g_HTMLBlock
                global g_HTMLcontent
                """print filename"""
                name=os.path.splitext(filename)[0]
                print name
                g_HTML = f.read()
                PreProcess()
                g_HTMLLine = [i.strip() for i in g_HTML.splitlines()]    # split into lines and strip whitespace from both ends of each line
                HTMLLength = [len(i) for i in g_HTMLLine]    # length of each line
                g_HTMLBlock = [HTMLLength[i] + HTMLLength[i+1] + HTMLLength[i+2] for i in range(0, len(g_HTMLLine)-3)]    # block length = combined length of 3 consecutive lines
                g_HTMLcontent=GetContent()
                f3=open(name+'.txt','w')
                f3.write(g_HTMLcontent)
                f3.close()

This is the source I referenced:
http://www.cnblogs.com/favourmeng/archive/2012/09/20/2695514.html

If you use the code from that page as-is, you will get a lot of garbled output, which comes down to encoding problems. My fix is fairly brutal: wherever it hurts, cut it out, i.e. I simply drop every page whose detected encoding is not 'utf-8'. Note the chardet module used for the detection; it can be downloaded here: https://pypi.python.org/pypi/chardet

The downloaded file, surprisingly, has an .alz extension, which I had never seen before. You need to get ALZip (easy to find on Baidu) to extract it, and then drop the extracted folder into your python\Lib\site-packages directory and you are done.
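
Instead of throwing away every page that is not utf-8, the encoding that chardet reports could also be used to transcode pages to utf-8 before processing. The following is only a rough sketch of that idea, my own addition rather than part of the code above, and decode_to_utf8 is just an illustrative helper name:

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Sketch only: transcode a page to utf-8 using chardet's guess,
# instead of skipping every non-utf-8 page. decode_to_utf8 is an illustrative name.
import chardet

def decode_to_utf8(raw_bytes):
    guess = chardet.detect(raw_bytes)    # e.g. {'encoding': 'GB2312', 'confidence': 0.99}
    enc = guess['encoding'] or 'utf-8'   # fall back to utf-8 if detection fails
    try:
        return raw_bytes.decode(enc, 'ignore').encode('utf-8')
    except LookupError:                  # unknown codec name: keep the raw bytes
        return raw_bytes

if __name__ == '__main__':
    fp = open('spiderDown/1.html', 'rb')
    html = decode_to_utf8(fp.read())
    fp.close()
    print chardet.detect(html)['encoding']    # should now report utf-8 (or ascii)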

Next comes word segmentation. I recommend jieba, which you can get from http://www.oschina.net/p/jieba?fromerr=VUOmZ4vh; installing it apparently has to be done from the cmd command line, at least that was the case for me. The model below is very simple: it is just a dictionary whose keys are the words themselves and whose values are the pages in which they appear. I assumed this would be trivial and tried to write the dict straight out with Python, only to find that a dict cannot be written to a text file directly, so I had to convert it to a str before saving. The code is as follows:

#! /usr/bin/env python
# -*- coding: utf-8 -*-
import jieba
import re
import os
rootdir = os.getcwd()
print('rootdir = ' + rootdir)
all_word={}
for (dirpath, dirnames, filenames) in os.walk(rootdir):
    for filename in filenames:
        name=os.path.splitext(filename)[0]
        name2=os.path.splitext(filename)[1]
        if name2=='.txt' and name!='dictionnary':
            f1 = open(name+'.txt', 'r')    # the .txt files produced by the previous step
            all_txt=f1.read()
            seg_list = jieba.cut(all_txt,cut_all=True)
            for word in seg_list:
                if all_word.get(word)==None:
                    all_word[word]=filename
                elif filename in all_word[word]:
                    continue
                else:
                    all_word[word]=all_word[word]+'    '+filename
            f1.close()
dictionnary=str(all_word)
f = open('dictionnary.txt', 'w')    # open for writing; 'r+' fails when the file does not exist yet
f.write(dictionnary)
f.close()
At this point there is basically nothing left to do; all that remains is to write one more program that performs the search (a sketch follows below). Of course, if this feels too simple, you can optimize it, for example by computing how often each word occurs within a page, so that the results most relevant to the query are ranked first.
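
For the search program itself, here is a minimal sketch of what it could look like. It assumes the index was written with str(all_word) as above, so ast.literal_eval can turn it back into a dict, and it segments the query with jieba the same way the index was built; load_index and search are just illustrative names, not part of the original code.

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Minimal search sketch over dictionnary.txt (word -> whitespace-separated file names).
import ast
import jieba

def load_index(path='dictionnary.txt'):
    f = open(path, 'r')
    data = f.read()
    f.close()
    return ast.literal_eval(data)    # rebuild the dict that was saved with str()

def search(index, query):
    result = None
    for word in jieba.cut(query):    # segment the query just like the indexed text
        word = word.strip()
        if not word:
            continue
        files = set(index.get(word, '').split())
        result = files if result is None else (result & files)    # AND all query terms
    return result or set()

if __name__ == '__main__':
    index = load_index()
    print search(index, u'新浪 新闻')

To rank results as suggested above, the dictionary values would also have to store a per-file occurrence count, so that pages containing the query words more often can be sorted to the top.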
