Recently a senior student assigned me a learning project: build my own search engine. I got burned by code I found online, so I am sharing my own results here.
What I built is a static, site-restricted search engine: starting from sina.com.cn as the seed URL, a breadth-first crawl collects 500 pages; once they are saved, you can build a search engine on top of those 500 pages.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from spdUtility import PriorityQueue, Parser
import urllib2
import sys
import os
import time

g_url = 'http://www.sina.com.cn'
g_key = 'www.sina'

"""
def line():
    try:
        raise Exception
    except:
        return sys.exc_info()[2].tb_frame.f_back.f_lineno
"""

def updatePriQueue(priQueue, url):
    # give .html pages and URLs under the start site a higher priority
    extraPrior = url.endswith('.html') and 2 or 0
    extraMyBlog = g_key in url and 5 or 0
    item = priQueue.getitem(url)  # look the URL up via the queue's url -> item map
    if item:
        newitem = (item[0] + 1 + extraPrior + extraMyBlog, item[1])
        priQueue.remove(item)
        priQueue.push(newitem)
    else:
        priQueue.push((1 + extraPrior + extraMyBlog, url))

def getmainurl(url):
    # return the site root of a URL (everything up to the first '/' after 'http://');
    # it is prepended to relative links found in the page
    ix = url.find('/', len('http://'))
    if ix > 0:
        return url[:ix]
    else:
        return url

def analyseHtml(url, html, priQueue, downlist):
    p = Parser()
    try:
        p.feed(html)
        p.close()
    except:
        return
    mainurl = getmainurl(url)
    print mainurl
    for (k, v) in p.anchors.items():
        for u in v:
            if not u.startswith('http://'):
                u = mainurl + u
            if not downlist.count(u):
                updatePriQueue(priQueue, u)

def downloadUrl(id, url, priQueue, downlist, downFolder):
    downFileName = downFolder + '/%d.html' % (id,)
    print 'downloading', url, 'as', downFileName, time.ctime(),
    try:
        fp = urllib2.urlopen(url)
    except:
        print '[ failed ]'
        return False
    else:
        print '[ success ]'
        downlist.push(url)  # downlist is only queried via count(), so a bare string will do
        op = open(downFileName, "wb")
        html = fp.read()
        op.write(html)
        op.close()
        fp.close()
        analyseHtml(url, html, priQueue, downlist)
        return True

def spider(beginurl, pages, downFolder):
    priQueue = PriorityQueue()   # URLs waiting to be crawled, highest priority popped first
    downlist = PriorityQueue()   # URLs already downloaded
    priQueue.push((1, beginurl))
    i = 0
    while not priQueue.empty() and i < pages:
        k, url = priQueue.pop()
        if downloadUrl(i + 1, url, priQueue, downlist, downFolder):
            i += 1
    print '\nDownloaded', i, 'pages in total.'

def main():
    beginurl = g_url
    pages = 500
    downloadFolder = './spiderDown'
    if not os.path.isdir(downloadFolder):
        os.mkdir(downloadFolder)
    spider(beginurl, pages, downloadFolder)

if __name__ == '__main__':
    main()
This is the crawler. Most of it comes from the web, but it did not really work as found; I have modified it and improved its performance a little.
Below are the helper classes it uses (the spdUtility module):
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import bisect
import htmllib
import formatter

class PriorityQueue(list):
    # a list kept in ascending order with bisect; pop() removes the last,
    # i.e. highest-priority, item; a url -> item map allows direct lookups
    def __init__(self):
        list.__init__(self)
        self.map = {}
    def push(self, item):
        if self.count(item) == 0:
            bisect.insort(self, item)
            self.map[item[1]] = item
    def pop(self):
        r = list.pop(self)
        del self.map[r[1]]
        return r
    def getitem(self, url):
        if url in self.map:
            return self.map[url]
        else:
            return None
    def empty(self):
        return len(self) == 0
    def remove(self, item):
        list.remove(self, item)
        del self.map[item[1]]
    def count(self, item):
        if len(self) == 0:
            return 0
        # binary search, since the list is always kept sorted
        left = 0
        right = len(self) - 1
        mid = -1
        while left <= right:
            mid = (left + right) // 2
            if self[mid] < item:
                left = mid + 1
            elif self[mid] > item:
                right = mid - 1
            else:
                break
        return self[mid] == item and 1 or 0

class Parser(htmllib.HTMLParser):
    # HTML parser: collects anchors as {anchor text: [href, ...]}
    def __init__(self, verbose=0):
        self.anchors = {}
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)
    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        self.anchor = href
    def anchor_end(self):
        text = self.save_end().strip()
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]

def main():  # just for test
    pq = PriorityQueue()
    # add items out of order
    pq.push((1, 'http://www.baidu.com'))
    pq.push((2, 'http://www.sina.com'))
    pq.push((3, 'http://www.google.com'))
    pq.push((1, 'http://www.163.com'))
    item = pq.getitem('http://www.sina.com')
    print item
    print pq.count(item)
    pq.remove(item)
    print pq.count(item)
    # print queue contents
    while not pq.empty():
        print pq.pop()

if __name__ == '__main__':
    main()
The next step is to strip the mass of HTML tags from the downloaded pages. I again started from code found online, but as soon as you actually run it you will find that there are encoding problems.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import chardet
import re
import sys
import os

def PreProcess():
    # strip tags, scripts, styles and entities from the page
    global g_HTML
    _doctype = re.compile(r'<!DOCTYPE.*?>', re.I | re.S)
    _comment = re.compile(r'<!--.*?-->', re.S)
    _javascript = re.compile(r'<script.*?>.*?</script>', re.I | re.S)
    _css = re.compile(r'<style.*?>.*?</style>', re.I | re.S)
    _dd = re.compile(r'<dd>.*?</dd>', re.I | re.S)
    _other_tag = re.compile(r'<.*?>', re.S | re.I)
    _special_char = re.compile(r'&.{1,5};|&#.{1,5};')
    g_HTML = _doctype.sub(' ', g_HTML)
    g_HTML = _comment.sub(' ', g_HTML)
    g_HTML = _javascript.sub(' ', g_HTML)
    g_HTML = _css.sub(' ', g_HTML)
    g_HTML = _dd.sub(' ', g_HTML)        # must run before the generic tag filter below
    g_HTML = _other_tag.sub(' ', g_HTML)
    g_HTML = _special_char.sub(' ', g_HTML)

def GetContent():
    # find the first run of non-empty line blocks and return everything from there on
    global g_HTMLBlock
    nMaxSize = len(g_HTMLBlock)
    nBegin = 0
    nEnd = nMaxSize
    for i in range(0, nMaxSize):
        if g_HTMLBlock[i] > 0 and i + 3 < nMaxSize and g_HTMLBlock[i+1] > 0 and g_HTMLBlock[i+2] > 0 and g_HTMLBlock[i+3] > 0:
            nBegin = i
            break
    return ' '.join(g_HTMLLine[nBegin:nEnd+1])

if __name__ == '__main__':
    rootdir = "e:/my/newspider/spiderDown"
    for parent, dirnames, filenames in os.walk(rootdir):  # yields the parent directory, its sub-directory names (without paths), and its file names
        for filename in filenames:
            f1 = open(os.path.join(parent, filename))
            f = open(os.path.join(parent, filename))
            fencoding = chardet.detect(f1.read())
            f1.close()
            print fencoding
            """f = file('spiderDown/79.html', 'r')
            f1 = file('spiderDown/79.html', 'r')
            fencoding = chardet.detect(f1.read())
            print fencoding"""
            if fencoding['encoding'] == 'utf-8' or fencoding['encoding'] == 'UTF-8':
                global g_HTML
                global g_HTMLLine
                global g_HTMLBlock
                global g_HTMLcontent
                name = os.path.splitext(filename)[0]
                print name
                g_HTML = f.read()
                PreProcess()
                g_HTMLLine = [i.strip() for i in g_HTML.splitlines()]  # split into lines, then strip surrounding whitespace from each line
                HTMLLength = [len(i) for i in g_HTMLLine]              # length of each line
                g_HTMLBlock = [HTMLLength[i] + HTMLLength[i+1] + HTMLLength[i+2] for i in range(0, len(g_HTMLLine)-3)]  # length of each 3-line block
                g_HTMLcontent = GetContent()
                f3 = open(name + '.txt', 'w')
                f3.write(g_HTMLcontent)
                f3.close()
            f.close()
This is the reference I worked from:
http://www.cnblogs.com/favourmeng/archive/2012/09/20/2695514.html
If you use the code from that page as-is you will get lots of mojibake, i.e. encoding problems. My fix is rather brutal: cut out whatever is causing trouble, so every page whose detected encoding is not utf-8 simply gets skipped. Pay attention to the chardet module used for the detection; it can be downloaded here: https://pypi.python.org/pypi/chardet
The downloaded file has an .alz extension, which I had never seen before. You need to get ALZip (easy to find on Baidu) to unpack it, and then drop the extracted folder into your python\Lib\site-packages directory.
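Since everything above hinges on chardet.detect, here is a minimal sketch of a gentler alternative to simply throwing away the non-utf-8 pages: decode with whatever encoding chardet reports and re-encode to utf-8. The helper name to_utf8 and the fallback handling are my own additions, not part of the scripts above.
# -*- coding: utf-8 -*-
import chardet

def to_utf8(raw):
    # chardet.detect returns e.g. {'encoding': 'GB2312', 'confidence': 0.99}
    guess = chardet.detect(raw)
    enc = guess['encoding'] or 'utf-8'
    try:
        # decode with the detected encoding, drop undecodable bytes, re-encode as utf-8
        return raw.decode(enc, 'ignore').encode('utf-8')
    except LookupError:
        # unknown codec name: keep the page unchanged
        return raw

html = open('spiderDown/1.html', 'rb').read()
print chardet.detect(html)
utf8_html = to_utf8(html)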
Next comes word segmentation. I recommend jieba, available at http://www.oschina.net/p/jieba?fromerr=VUOmZ4vh; installation apparently has to be done from the cmd command line, at least it did for me. The index model below is very simple, essentially just a dictionary: the key is the word itself and the value is the list of pages the word appears in. I thought this would be trivial and tried to write the Python dict straight to a text file, only to find that a dict cannot be written directly, so it has to be converted to a str first. The code is as follows:
# -*- coding: utf-8 -*-
import jieba
import os

rootdir = os.getcwd()
print('rootdir = ' + rootdir)
all_word = {}   # inverted index: word -> space-separated list of files it appears in
for (dirpath, dirnames, filenames) in os.walk(rootdir):
    for filename in filenames:
        name = os.path.splitext(filename)[0]
        name2 = os.path.splitext(filename)[1]
        if name2 == '.txt' and name != 'dictionnary':
            f1 = open(os.path.join(dirpath, filename), 'r')
            all_txt = f1.read()
            seg_list = jieba.cut(all_txt, cut_all=True)
            for word in seg_list:
                if all_word.get(word) is None:
                    all_word[word] = filename
                elif filename in all_word[word].split():
                    continue
                else:
                    all_word[word] = all_word[word] + ' ' + filename
            f1.close()
dictionnary = str(all_word)
f = open('dictionnary.txt', 'w')
f.write(dictionnary)
f.close()
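Because loading the str() version back is awkward, an alternative I did not use above is to serialize the index with json instead; the file name dictionnary.json is just an example:
import json

# write the index so it can be loaded back directly, instead of str(all_word)
with open('dictionnary.json', 'w') as fp:
    json.dump(all_word, fp)

# reading it back gives the same word -> "file file ..." mapping (keys as unicode)
with open('dictionnary.json') as fp:
    all_word_loaded = json.load(fp)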
At this point the hard part is done; all that is left is to write a program that performs the actual search. Of course, if this feels too simple you can refine it, for example by computing how often each word occurs within a page, so that the pages most relevant to the query are ranked first.
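To round things off, here is a minimal sketch of what that search program could look like, under the assumptions used above: the index was saved with str(all_word) into dictionnary.txt, so ast.literal_eval can read it back, and the "ranking" simply counts how many of the query's words each page contains, which is only a starting point for the frequency-based ranking just mentioned.
# -*- coding: utf-8 -*-
import ast
import jieba

def load_index(path='dictionnary.txt'):
    # the file holds the repr of the dict, so literal_eval turns it back into one
    with open(path) as fp:
        return ast.literal_eval(fp.read())

def search(query, index):
    hits = {}   # file name -> number of query words found in that page
    for word in jieba.cut(query):
        for page in index.get(word, '').split():
            hits[page] = hits.get(page, 0) + 1
    # pages that contain more of the query words come first
    return sorted(hits, key=hits.get, reverse=True)

if __name__ == '__main__':
    index = load_index()
    for page in search(u'新浪 新闻', index):
        print page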