# This module extracts further content from downloaded web pages, such as the page's encoding type and title.
#!/usr/bin/env python
# -*- coding: GBK -*-
'''
Created on Jul 17, 2013
@author: belong
'''
import re
import urllib
from collections import deque

# urlopen moved to urllib.request in Python 3; keep a 2/3-compatible alias.
try:
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen
class Tools:
    """Small logging and regex helpers shared by the crawler classes."""

    def write_log(self, level, info):
        """Print a log line formatted as ``[level]info``."""
        print('[' + level + ']' + info)

    def match_regex(self, pattern, src):
        """Return all case-insensitive matches of *pattern* in *src*.

        Matches are joined with single spaces; returns "" when nothing
        matches.  Fixes: removed a leftover debug print of the match list,
        and replaced quadratic ``+=`` concatenation with ``str.join``.
        """
        matches = re.findall(pattern, src, re.I)
        # join + strip reproduces the original "m1 m2 ".strip() result
        return " ".join(matches).strip()
class Crawler:
    """Simple breadth-first web crawler built on ``urlopen``."""

    # Compiled once at class level; [a-zA-Z] fixes the original [a-zA-z],
    # whose A-z range also matched the punctuation characters [ \ ] ^ _ `
    _URL_PATTERN = re.compile(r'[a-zA-Z]+://[^\s]*\.html?', re.I)

    def _fetch(self, url):
        """Fetch *url* and return the body as text; always closes the connection."""
        conn = urlopen(url)
        try:
            data = conn.read()  # read() with no size returns the whole body
        finally:
            conn.close()
        if isinstance(data, bytes):
            # Python 3 returns bytes; decode so text regexes apply.
            # NOTE(review): assumes UTF-8-ish pages; undecodable bytes dropped.
            data = data.decode('utf-8', 'ignore')
        return data

    def get_url(self, url):
        """Return the list of .htm/.html links found in the page at *url*.

        Bug fix: the original could raise NameError (``urls`` unbound when
        the first read was empty) and never closed the connection on error.
        """
        return self._URL_PATTERN.findall(self._fetch(url))

    def download_url(self, url, filename):
        """Download *url*, save it to *filename*, and return the page text.

        Bug fix: the original returned the final (empty) ``read()`` result,
        so callers always received "".
        """
        Tools().write_log("info", "开始下载")
        data = self._fetch(url)
        with open(filename, 'w') as f:
            f.write(data)
        Tools().write_log("info", "网页下载成功")
        return data

    def broad_traverse(self, start_url, number):
        """Breadth-first crawl from *start_url*, visiting at most *number* pages.

        Returns the visited URLs in visit order.  Uses a deque (O(1) pops
        from the left) and a ``seen`` set instead of the original O(n)
        ``list.pop(0)`` / ``list.count`` calls, and ``and`` instead of the
        bitwise ``&`` the original applied to the count results.
        """
        Tools().write_log("info", "开始遍历")
        visited = []
        seen = set([start_url])
        queue = deque([start_url])
        while queue and len(visited) < number:
            url = queue.popleft()
            print('%s %d' % (url, len(visited)))
            # self.download_url(url, str(len(visited)) + '.html')
            visited.append(url)
            for link in self.get_url(url):
                if link not in seen:
                    seen.add(link)
                    queue.append(link)
        Tools().write_log("info", "遍历成功")
        return visited

    def main(self):
        """Demo entry point: crawl up to 10 pages starting from Baidu."""
        start_url = "http://www.baidu.com"
        self.broad_traverse(start_url, 10)
# Data extraction class
class DataExtractor:
    """Extracts title, content-type and charset metadata from raw HTML."""

    def get_title(self, data):
        """Return the ``<title>...</title>`` fragment(s) of *data*, or ""."""
        title = Tools().match_regex('<title>.*?<\/title>', data)
        if title == "":
            Tools().write_log("Error", "标题匹配不成功")
        return title

    def get_type(self, data):
        """Return ``<meta ... content=...>`` fragments describing the content type."""
        # local renamed from `type` so it no longer shadows the builtin
        content_type = Tools().match_regex('<meta.*content=.*?\/>', data)
        return content_type

    def get_charset(self, data):
        """Return ``<meta ... charset=...>`` fragments describing the encoding."""
        return Tools().match_regex('<meta.*?charset=.*?\/>', data)

    def get_info(self, url):
        """Download *url* and return a dict with 'title', 'type' and 'charset'.

        Re-raises any download failure after logging it.
        """
        try:
            data = Crawler().download_url(url, str(11) + '.html')
        except Exception:  # narrowed from the original bare except
            Tools().write_log("error", url + "抓取失败")
            raise
        Tools().write_log("info", "开始数据匹配")
        rst = {}
        rst['title'] = self.get_title(data)
        print('title: %s' % rst['title'])
        rst['type'] = self.get_type(data)
        print('type: %s' % rst['type'])
        rst['charset'] = self.get_charset(data)
        print('charset: %s' % rst['charset'])
        # bug fix: third placeholder was labelled "type" instead of "charset"
        Tools().write_log("DEBUG", 'title=%s,type=%s,charset=%s'
                          % (rst['title'], rst['type'], rst['charset']))
        return rst
def main():
    """Crawl from the seed URL and extract metadata from each visited page."""
    start_url = "http://www.baidu.com"
    crawler = Crawler()
    url_list = crawler.broad_traverse(start_url, 10)
    data_extractor = DataExtractor()
    for url in url_list:
        data_extractor.get_info(url)


# Guard fixes a real defect: importing this module used to start a live crawl.
if __name__ == '__main__':
    main()