详细理解 ---------- 即时标记

最新推荐文章于 2023-02-10 16:53:46 发布

芒果耶耶@

最新推荐文章于 2023-02-10 16:53:46 发布

阅读量211

点赞数

文章标签： python 过滤器 filter 正则表达式

本文链接：https://blog.csdn.net/weixin_36511801/article/details/108296718

版权

详细理解 ---------- 即时标记

composing.py

import sys
import re

###########################----Rules-----#################################
class Rule:
    '''
    Base class for all rules.

    Provides the default ``action``: wrap the block in a start/end pair
    named after the rule's ``type`` and report that the block is handled.
    '''
    # Tag name passed to the handler's start()/end(); subclasses override it.
    type = ''

    def action(self, block, handler):
        '''
        Emit ``block`` through ``handler`` between start and end events.

        Returns True, which the caller treats as "stop trying further rules".
        '''
        kind = self.type
        handler.start(kind)
        handler.feed(block)
        handler.end(kind)
        return True

class HeadingRule(Rule):
    '''
    Rule for headings.

    A heading is a single line of at most 70 characters that does not end
    with a colon. The inherited ``action`` wraps it in 'heading' events.
    '''
    type = 'heading'

    def condition(self, block):
        '''
        Return True if ``block`` looks like a heading.

        An empty block is never a heading; the explicit check also avoids
        the IndexError that indexing the last character would raise.
        '''
        if not block:
            return False
        return (
            '\n' not in block
            and len(block) <= 70
            and not block.endswith(':')
        )

class TitleRule(HeadingRule):
    '''
    Rule for the document title.

    Only the first block this rule is asked about can be a title, and only
    if it also satisfies the heading condition. The inherited ``action``
    wraps it in 'title' events.
    '''
    type = 'title'
    # True until condition() has been called once on this instance.
    first = True

    def condition(self, block):
        '''
        Return True only for the first block tested, if it is heading-like.

        The ``first`` flag is cleared on the first call regardless of the
        outcome, so every later call returns False.
        '''
        if self.first:
            self.first = False
            return HeadingRule.condition(self, block)
        return False


class ListItemRule(Rule):
    '''
    Rule for a single list item.

    A block is a list item when it starts with ``marker``. The action emits
    the block, minus the marker and surrounding whitespace, between
    'listitem' start/end events.
    '''
    type = "listitem"
    # Prefix that identifies a list item; override in a subclass to change it.
    # NOTE(review): Markdown-style lists usually start with '-'; confirm that
    # '_' is the intended marker for the input files.
    marker = '_'

    def condition(self, block):
        '''
        Return True if ``block`` starts with the list-item marker.

        ``startswith`` is used so that an empty block yields False instead
        of raising IndexError.
        '''
        return block.startswith(self.marker)

    def action(self, block, handler):
        '''
        Emit the item text (marker removed, whitespace stripped).

        Returns True so that the caller stops trying further rules.
        '''
        handler.start(self.type)
        # strip() must be called: feeding the bound method itself would put
        # a non-string object into the handler's output.
        handler.feed(block[len(self.marker):].strip())
        # Pass the rule's own type name, not the rule plus the builtin type.
        handler.end(self.type)
        return True

class ListRule(ListItemRule):
    '''
    Rule that opens and closes a list around runs of list items.

    It is consulted for every block (``condition`` is always True) and
    never consumes the block itself (``action`` always returns False), so
    later rules still get to handle it.
    '''
    type = 'list'
    # True while the most recent blocks seen were list items.
    inside = False

    def condition(self, block):
        '''Accept every block so that action() can track list boundaries.'''
        return True

    def action(self, block, handler):
        '''
        Emit a 'list' start when a run of items begins and a 'list' end
        when the first non-item block follows such a run.

        Always returns False so that rule processing continues.
        '''
        is_item = ListItemRule.condition(self, block)
        if is_item and not self.inside:
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            handler.end(self.type)
            self.inside = False
        return False

class ParagraphRule(Rule):
    '''
    Fallback rule: every block qualifies as a paragraph.

    The inherited ``action`` wraps the block in 'paragraph' events.
    '''
    type = 'paragraph'

    def condition(self, block):
        '''Return True unconditionally; any block can be a paragraph.'''
        return True

###########################----Handler-----#################################
class Handler:
    '''
    Base handler that dispatches events to optional, name-derived methods.

    ``start(name)``, ``end(name)`` and ``sub(name)`` look for methods called
    ``start_<name>``, ``end_<name>`` and ``sub_<name>`` on the instance and
    call them if they exist; missing methods are silently skipped.
    Collected output chunks are stored in ``self.output``.
    '''

    def __init__(self):
        # Output chunks, in the order they were produced.
        self.output = []

    def getV(self):
        '''Return the list of output chunks collected so far.'''
        return self.output

    def start(self, name):
        '''Call ``start_<name>()`` if this handler defines it.'''
        self.test('start_', name)

    def end(self, name):
        '''Call ``end_<name>()`` if this handler defines it.'''
        self.test('end_', name)

    def sub(self, name):
        '''
        Return a replacement callback suitable for ``re.sub``.

        The callback delegates to ``sub_<name>(match)``. When that method
        does not exist (or returns None), the matched text is returned
        unchanged so the substitution leaves the input intact.
        '''
        def substitution(match):
            result = self.test('sub_', name, match)
            if result is None:
                # Fall back to the original text instead of returning None,
                # which would make re.sub drop the matched text.
                return match.group(0)
            return result
        return substitution

    def test(self, prefix, name, *args):
        '''
        Call the method named ``prefix + name`` with ``args`` if it exists.

        Returns the method's result, or None when there is no such callable.
        '''
        method = getattr(self, prefix + name, None)
        if callable(method):
            return method(*args)
        return None

class HTMLRenderer(Handler):
    '''
    Handler that renders events as HTML fragments.

    Each ``start_*`` / ``end_*`` method appends the matching opening or
    closing markup to ``self.output``; each ``sub_*`` method returns the
    replacement markup for a regex match; ``feed`` appends raw block text.
    '''

    def _emit(self, markup):
        # Single place where chunks are added to the output list.
        self.output.append(markup)

    # --- document -----------------------------------------------------
    def start_document(self):
        self._emit('<html><head><title>...</title></head>\n<body>\n')

    def end_document(self):
        self._emit('\n</body>\n</html>')

    # --- block-level elements -----------------------------------------
    def start_paragraph(self):
        self._emit('<p>')

    def end_paragraph(self):
        self._emit('</p>')

    def start_heading(self):
        self._emit('<h2>')

    def end_heading(self):
        self._emit('</h2>')

    def start_list(self):
        self._emit('<ul>')

    def end_list(self):
        self._emit('</ul>')

    def start_listitem(self):
        self._emit('<li>')

    def end_listitem(self):
        self._emit('</li>')

    def start_title(self):
        self._emit('<h1>')

    def end_title(self):
        self._emit('</h1>')

    # --- inline substitutions -----------------------------------------
    def sub_emphasis(self, match):
        # Wraps the whole matched text (group 0) in <em>.
        whole = match.group(0)
        return '<em>%s</em>' % whole

    def sub_url(self, match):
        # The first capture group is used as both link target and label.
        target = match.group(1)
        return '<a href="%s">%s</a>' % (target, target)

    def sub_mail(self, match):
        # The first capture group is used as both address and label.
        address = match.group(1)
        return '<a href="mailto:%s">%s</a>' % (address, address)

    # --- raw text -----------------------------------------------------
    def feed(self, data):
        self._emit(data)


###########################----Parser-----#################################
class Parser:
    '''
    Reads text blocks from a file, applies filters and rules to each block,
    and writes the handler's collected output to an HTML file.

    Attributes:
        handler: object receiving start/feed/end/sub events.
        rules: rule objects, tried in insertion order for every block.
        filters: callables ``(block, handler) -> block`` applied in order.
        blocks: stripped, non-empty lines read from the input file.
    '''

    def __init__(self, handler):
        self.handler = handler
        self.rules = []
        self.filters = []
        self.blocks = []

    def getText(self):
        '''
        Read "text_input.txt" and append each non-blank line, stripped of
        surrounding whitespace, to ``self.blocks``.

        Lines containing only whitespace are skipped as well, so no empty
        block ever reaches the rules (which index into the block text).
        '''
        with open("text_input.txt", "r") as source:
            for line in source:
                block = line.strip()
                if block:
                    self.blocks.append(block)

    def addRule(self, rule):
        '''Append ``rule`` to the list of rules tried for each block.'''
        self.rules.append(rule)

    def addFilter(self, pattern, filter):
        '''
        Register a regex substitution filter.

        Every match of ``pattern`` in a block is replaced by the result of
        the handler's substitution callback for the name ``filter``.
        '''
        def filter_action(block, handler):
            return re.sub(pattern, handler.sub(filter), block)
        self.filters.append(filter_action)

    def writer(self, str):
        '''
        Write the output chunks in ``str`` to "out.html".

        A newline is inserted after every third chunk so that each
        open-tag / text / close-tag group lands on its own line.
        '''
        with open("out.html", "w") as target:
            for position, chunk in enumerate(str, start=1):
                target.write(chunk)
                if position % 3 == 0:
                    target.write("\n")

    def action(self):
        '''
        Run the whole pipeline: read the input, process every block, and
        write the rendered output.

        For each block, all filters are applied first; then the rules are
        tried in order and processing of the block stops at the first
        matching rule whose action returns a true value.
        '''
        self.handler.start('document')
        print("---------------")
        self.getText()

        for block in self.blocks:
            print(block)
            for filter in self.filters:
                block = filter(block, self.handler)
            for rule in self.rules:
                if rule.condition(block):
                    last = rule.action(block, self.handler)
                    if last:
                        break
        self.handler.end('document')
        # Use this parser's own handler rather than a module-level global,
        # so the class also works when no global named "handler" exists.
        rendered = self.handler.getV()
        print(len(rendered))
        self.writer(rendered)


class AddRuleOrFilter(Parser):
    '''
    Parser preconfigured with the standard rules and substitution filters.

    All registration happens in ``__init__``.
    '''

    def __init__(self, handler):
        Parser.__init__(self, handler)

        # Rules are tried in this order for every block, so the sequence
        # here must be preserved.
        rule_classes = (
            ListRule,
            ListItemRule,
            TitleRule,
            HeadingRule,
            ParagraphRule,
        )
        for rule_class in rule_classes:
            self.addRule(rule_class())

        # (regex pattern, substitution name) pairs, applied in this order.
        substitutions = (
            (r'([A-Z]{1}[a-z]+[\s+|,]){3,}', 'emphasis'),
            (r'(http://[\.a-z0-9A-Z/]+)', 'url'),
            (r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail'),
        )
        for pattern, name in substitutions:
            self.addFilter(pattern, name)

handler = HTMLRenderer()    # Create the HTML rendering handler.
parser = AddRuleOrFilter(handler)   # Create the text parser, passing it the HTML handler.
# Read text_input.txt, process every block, and write the result to out.html.
parser.action()

text_input.txt

Welcome to World Wide Spam. Inc.
These are the corporate web pages of World Wide Spam, Inc. We hope you find your stay enjoyable, and that you will sample many of our products.


A short history of the company


World Wide Spam was started in the summer of 2000. The business concept was to ride the dot-com wave and to make money both through bulk email and by selling canned meat online.


After receiving several complaints from customers who weren't satisfied by their bulk email, World Wide Spam altered their profile, and focused 100% on canned goods. Today, they rank as the world's 13,892 online supplier of SPAM.

Destinations

From this page you may visit several of our interesting web pages:


 - What is SPAM?(http://wwspam.fu/whatisspam)

 - How do they make it?(http://wwspam.fu/howtomakeit)

 - Why should I eat it?(http://wwspam.fu/whyeatit)

How to get in touch with us


You can get in touch with us in *many* ways: By phone (555-1234), by email (wwspam@wwspam.fu) or by visiting our customer feedback page (http://wwspam.fu/feedback)

out.html

<html><head><title>...</title></head>
<body>
<h1>Welcome to World Wide Spam. Inc.
</h1><p>These are the corporate web pages of <em>World Wide Spam,</em> Inc. We hope you find your stay enjoyable, and that you will sample many of our products.
</p><h2>A short history of the company
</h2><p><em>World Wide Spam </em>was started in the summer of 2000. The business concept was to ride the dot-com wave and to make money both through bulk email and by selling canned meat online.
</p><p>After receiving several complaints from customers who weren't satisfied by their bulk email, <em>World Wide Spam </em>altered their profile, and focused 100% on canned goods. Today, they rank as the world's 13,892 online supplier of SPAM.
</p><h2>Destinations
</h2><p>From this page you may visit several of our interesting web pages:
</p><p>- What is SPAM?(<a href="http://wwspam.fu/whatisspam">http://wwspam.fu/whatisspam</a>)
</p><p>- How do they make it?(<a href="http://wwspam.fu/howtomakeit">http://wwspam.fu/howtomakeit</a>)
</p><p>- Why should I eat it?(<a href="http://wwspam.fu/whyeatit">http://wwspam.fu/whyeatit</a>)
</p><h2>How to get in touch with us
</h2><p>You can get in touch with us in *many* ways: By phone (555-1234), by email (<a href="mailto:wwspam@wwspam.fu">wwspam@wwspam.fu</a>) or by visiting our customer feedback page (<a href="http://wwspam.fu/feedback">http://wwspam.fu/feedback</a>)
</p>
</body>
</html>