项目需求和原型设计来自于:
《Python基础教程》第20章
类图:
与原版代码相比,主要的改动是把“Filter”(过滤器)的功能从 Handler 中独立了出来,使各功能模块的职责更加清晰。
代码如下:
#encoding=utf-8
#by panda
import re
class filter():
    """Wrap specific inline text patterns in decorative markup tags.

    Each instance handles one named pattern from ``patterns``; every match
    is rewritten by the ``sub_<name>`` method of the same name.
    """

    # Regular expressions keyed by filter name.
    patterns = {
        'emphasis' : r'\*(.+?)\*',
        'url' : r'(http://[\.a-zA-Z/]+)',
        'mail' : r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)',
    }

    def __init__(self, name):
        """Select the pattern registered under *name*.

        Raises KeyError when *name* is not a key of ``patterns``.
        """
        self.name = name
        self.pattern = self.patterns[self.name]

        def substitute(matchobj):
            # Keep the matched text unchanged when no ``sub_<name>`` method
            # exists or it produced a falsy replacement.
            replacement = self.callback('sub_', name, matchobj)
            if replacement:
                return replacement
            return matchobj.group(0)

        self.actionFunc = substitute

    def action(self, block):
        """Return *block* with every match of the pattern substituted."""
        return re.compile(self.pattern).sub(self.actionFunc, block)

    def callback(self, prefix, name, *args):
        """Call the method named ``prefix + name`` with *args*.

        Returns the method's result, or None when this object has no
        callable attribute of that name.
        """
        method = getattr(self, prefix + name, None)
        if not callable(method):
            return None
        return method(*args)

    def sub_emphasis(self, match):
        """Render ``*text*`` as an ``<em>`` element."""
        emphasized = match.group(1)
        return '<em>%s</em>' % emphasized

    def sub_url(self, match):
        """Render a matched URL as a hyperlink to itself."""
        url = match.group(1)
        return '<a href="%s">%s</a>' % (url, url)

    def sub_mail(self,match):
        """Render a matched e-mail address as a ``mailto:`` link."""
        address = match.group(1)
        return '<a href="mailto:%s">%s</a>' % (address, address)
class Rule():
    """Base class for rules that wrap a text block in markup tags.

    ``action`` reads ``self.type``, which this base class does not define;
    subclasses are expected to provide it as the tag name passed to the
    handler's ``start``/``end``.
    """

    def __init__(self, handler):
        """Remember the *handler* used to render the block."""
        self.handler = handler

    def conditon(self, block):
        """Record *block* for a later ``action`` call and return False.

        The name is misspelled but kept, because subclasses call
        ``Rule.conditon(self, block)`` explicitly.
        """
        self.block = block
        return False

    def condition(self, block):
        """Correctly spelled entry point; delegates to ``conditon``.

        Callers invoke ``rule.condition(block)``, so without this method a
        plain ``Rule`` (or a subclass overriding only ``conditon``) would
        raise AttributeError.
        """
        return self.conditon(block)

    def action(self):
        """Render the recorded block between the handler's start/end markup.

        Returns the concatenated markup string. Handler hooks that return
        None (no matching method on the handler) or an empty string are
        skipped instead of breaking the concatenation with a TypeError.
        """
        parts = (
            self.handler.start(self.type),
            self.handler.feed(self.block),
            self.handler.end(self.type),
        )
        result = ''.join(part for part in parts if part)
        # Trace output kept from the original implementation; the
        # single-argument call form prints identically on Python 2 and 3.
        print(result)
        return result
class TitleRule(Rule):
    """
    Matches the document title: only the very first block examined can
    qualify, and only when it looks like a heading.
    """
    type = 'title'
    # Class-level default; shadowed per instance once a block has been seen.
    first = True

    def condition(self, block):
        """Return True when *block* is the first block and heading-like.

        Heading-like means a single line of at most 70 characters that
        does not end with a colon.
        """
        Rule.conditon(self, block)
        if not self.first:
            return False
        # Only the first block gets a chance, whether or not it qualifies.
        self.first = False
        if '\n' in block:
            return False
        if len(block) > 70:
            return False
        return block[-1] != ':'
class ParagraphRule(Rule):
    """
    Catch-all rule: any block that reaches it is treated as a plain
    paragraph.
    """
    type = 'paragraph'

    def condition(self, block):
        """Record *block* on the rule and accept it unconditionally."""
        Rule.conditon(self, block)
        return True
class Handler():
    """Dispatch markup events to optional ``<prefix><name>`` methods."""

    def callback(self, prefix, name,*args):
        """Call the method named ``prefix + name`` with *args*.

        Returns the method's result, or None when this object has no
        callable attribute of that name.
        """
        target = getattr(self, prefix + name, None)
        return target(*args) if callable(target) else None

    def start(self, name):
        """Return the result of ``start_<name>()``, or None if undefined."""
        return self.callback('start_', name)

    def end(self, name):
        """Return the result of ``end_<name>()``, or None if undefined."""
        return self.callback('end_', name)
class HTMLRenderer(Handler):
    """Handler that emits HTML tags for each supported block type."""

    # --- document wrapper -------------------------------------------------
    def start_document(self):
        """Return the HTML preamble, re-encoded from UTF-8 to GBK."""
        preamble = unicode('<html><head><title>文本标记结果</title></head><body>', 'utf-8')
        return preamble.encode('gbk')

    def end_document(self):
        return '</body></html>'

    # --- headings ---------------------------------------------------------
    def start_title(self):
        return '<h1>'

    def end_title(self):
        return '</h1>'

    def start_heading(self):
        return '<h2>'

    def end_heading(self):
        return '</h2>'

    # --- body text --------------------------------------------------------
    def start_paragraph(self):
        return '<p>'

    def end_paragraph(self):
        return '</p>'

    # --- lists ------------------------------------------------------------
    def start_list(self):
        return '<ul>'

    def end_list(self):
        return '</ul>'

    def start_listitem(self):
        return '<li>'

    def end_listitem(self):
        return '</li>'

    def feed(self,block):
        """Return the block text unchanged (no escaping is applied)."""
        return block
class Parser():
    """Text analyser: runs the registered filters and rules over each block."""

    def __init__(self):
        self.filters = []
        self.rules = []

    def parse(self, content):
        """Return the markup for every non-blank item of *content*, joined.

        Each item is stripped, passed through all filters in registration
        order, and then offered to the rules in order.
        """
        pieces = []
        for raw in content:
            block = raw.strip()
            if not block:
                continue
            for text_filter in self.filters:
                block = text_filter.action(block)
            for rule in self.rules:
                if not rule.condition(block):
                    continue
                last = rule.action()
                # NOTE(review): the listing this was taken from lost its
                # indentation; the break is assumed to belong to this branch,
                # so a rule that yields nothing lets later rules try.
                if last:
                    block = last
                    break
            pieces.append(block)
        return ''.join(pieces)

    def addRule(self,rule):
        """Append *rule* to the list of rules tried by ``parse``."""
        self.rules.append(rule)

    def addFilter(self,name):
        """Create the filter registered under *name* and append it."""
        self.filters.append(filter(name))
class BasicTextParser(Parser):
    """Parser preconfigured with the inline filters and the block rules."""

    def __init__(self,handler):
        """Register the standard filters and rules, rendering via *handler*."""
        Parser.__init__(self)
        self.handler = handler
        for filter_name in ('emphasis', 'url', 'mail'):
            self.addFilter(filter_name)
        # TitleRule goes first: ParagraphRule accepts every block.
        for rule_class in (TitleRule, ParagraphRule):
            self.addRule(rule_class(handler))

    def parse(self, content):
        """Return the parsed *content* wrapped in the handler's document markup."""
        opening = self.handler.start('document')
        body = Parser.parse(self,content)
        closing = self.handler.end('document')
        return ''.join([opening, body, closing])
#import sys
if __name__ == '__main__':
    # Render test_input.txt to test.html and echo the markup to stdout.
    handler = HTMLRenderer()
    parser = BasicTextParser(handler)
    # ``with`` closes the input even if parsing raises.
    with open('test_input.txt') as f:
        output = parser.parse(f)
    # Single-argument call form prints identically on Python 2 and 3.
    print(output)
    # Open the output only after parsing succeeded, so a failed run does not
    # leave a truncated test.html behind.
    with open('test.html', 'w') as f2:
        f2.write(output)