对于这种“外科手术式”的解析(只想提取特定的标签,而不是构建完整的层级文档),pyparsing 的 makeHTMLTags 方法非常有用。
请参阅下面带注释的脚本,其中演示了如何创建解析器,并将其用于 parseTag 和 replaceTag 方法:
import pyparsing as pp
def make_tag_parser(tag):
    """Build a pyparsing expression matching the opening ``<tag ...>`` element.

    Each match carries the raw tag text as token 0 and again under the
    ``raw`` results name, alongside the fields parsed from the tag itself.
    """
    # makeHTMLTags hands back an (opening, closing) pair of expressions;
    # only the opening one is needed here.
    opening, _closing = pp.makeHTMLTags(tag)

    def _attach_raw(tokens):
        # Expose the raw tag text (token 0) under the 'raw' results name.
        tokens['raw'] = tokens[0]

    # originalTextFor swaps the parsed pieces for the raw matched text;
    # asString=False keeps the parsed attributes and tag name available
    # as named results.
    raw_parser = pp.originalTextFor(opening, asString=False)
    raw_parser.addParseAction(_attach_raw)
    return raw_parser
# parseTag to find all the matches and report their attributes
def parseTag(tag, s):
    """Return every match of the opening ``tag`` element found in ``s``."""
    tag_parser = make_tag_parser(tag)
    return tag_parser.searchString(s)
# Sample input; as written it contains no <photo> tag, so the loop below
# has nothing to report.
content = """This is a string"""
tag_matches = parseTag("photo", content)
for match in tag_matches:
    # Gather the full dump plus the individual named fields, then emit them
    # together, one per line.
    report_lines = [
        match.dump(),
        "raw: {!r}".format(match.raw),
        "tag: {!r}".format(match.tag),
        "id: {!r}".format(match.id),
    ]
    print("\n".join(report_lines))
# transform tag to perform tag->div transforms
def replaceTag(tag, transform, s):
    """Rewrite each opening ``tag`` element in ``s`` using ``transform``.

    ``transform`` is a ``str.format`` template; it is filled in from the
    named results of each match, and the transformed string is returned.
    """
    def _render(tokens):
        # Named results become the keyword arguments of the template.
        return transform.format(**tokens)

    tag_parser = make_tag_parser(tag)
    # One more parse action, run after the 'raw' one, performs the transform.
    tag_parser.addParseAction(_render)
    return tag_parser.transformString(s)
# NOTE(review): the template literal was truncated in this copy of the script
# (only a lone quote remained, which does not parse). The template below is a
# stand-in built from the 'tag' and 'id' fields printed earlier in the script;
# confirm it against the intended markup.
print(replaceTag("photo",
                 '<div class="{tag}" id="{id}"></div>',
                 content))
输出:
(此处的示例输出在文本提取过程中丢失。)