如果存在递归嵌套的表达式,可以使用 pyparsing 在逗号处拆分,并同时验证嵌套的括号是否匹配:
import pyparsing as pp
def CommaSplit(txt, strip_fields=False):
    """Split ``txt`` on top-level commas, like ``str.split(',')``.

    Commas that sit inside an ``ident<...>`` or ``ident(...)`` nested
    expression, or inside a single- or double-quoted string, do not split.

    Args:
        txt: The string to split.
        strip_fields: When true, strip surrounding whitespace and then
            surrounding quote characters from every returned field.
            Defaults to False, which keeps fields exactly as sliced
            (leading spaces and quotes included).

    Returns:
        A list of substrings of ``txt``. When pyparsing raises
        ``ParseException`` for ``txt``, the result is ``[txt]`` unchanged.
    """
    com_lok = []
    comma = pp.Suppress(',')
    # Record the location of each comma matched outside an ignored expression.
    comma.setParseAction(lambda s, lok, toks: com_lok.append(lok))
    ident = pp.Word(pp.alphas + "_", pp.alphanums + "_")  # python identifier
    # Ignore everything inside nested '< >'. Both delimiters must be given
    # explicitly; nestedExpr defaults to '(' and ')'.
    ex1 = ident + pp.nestedExpr(opener='<', closer='>')
    ex2 = ident + pp.nestedExpr()  # Ignore everything inside nested '( )'
    ex3 = pp.Regex(r'("|\').*?\1')  # Ignore everything inside "'" or '"'
    atom = ex1 | ex2 | ex3 | comma
    expr = pp.OneOrMore(atom) + pp.ZeroOrMore(comma + atom)
    try:
        # Parsed only for its side effect: the parse action fills com_lok.
        expr.parseString(txt)
    except pp.ParseException:
        return [txt]
    # Each field runs from just after one recorded comma up to the next one.
    starts = [0] + [lok + 1 for lok in com_lok]
    ends = com_lok + [len(txt)]
    rtr = [txt[st:end] for st, end in zip(starts, ends)]
    if strip_fields:
        rtr = [e.strip().strip('\'"') for e in rtr]
    return rtr
# Demo inputs, one per line, fed to CommaSplit below.
tests='''\
obj<1, 2, 3>, x(4, 5), "msg, with comma"
nesteobj<1, sub<6, 7>, 3>, nestedx(4, y(8, 9), 5), "msg, with comma"
nestedobj<1, sub<6, 7>, 3>, nestedx(4, y(8, 9), 5), 'msg, with comma', additional<1, sub<6, 7>, 3>
bare_comma<1, sub(6, 7), 3>, x(4, y(8, 9), 5), , 'msg, with comma', obj<1, sub<6, 7>, 3>
bad_close<1, sub<6, 7>, 3), x(4, y(8, 9), 5), 'msg, with comma', obj<1, sub<6, 7>, 3)
'''

# Print every sample followed by the list of fields CommaSplit returns for it.
for line in tests.splitlines():
    fields = CommaSplit(line)
    print(line, '==>\n\t', fields)
输出:
^{pr2}$
当前行为与 '(something does not split), b, "in quotes", c'.split(',') 相同,包括保留前导空格和引号。从字段中去掉引号和前导空格很简单。
给 CommaSplit 增加一个参数 strip_fields,并将 try 下的 else 分支更改为:
else:
rtr = [txt[st:end] for st,end in zip([0]+[e+1 for e in com_lok],com_lok+[len(txt)])]
if strip_fields:
rtr=[e.strip().strip('\'"') for e in rtr]
return rtr