我最初的想法是使用 tokenizer，但经过一些测试后发现，它似乎无法识别 PHP 的短开始标签（<?）。
因此我决定编写一个正则模式，专门描述两类陷阱：注释和字符串。
// Extracts the body of every PHP block (<?php ... ?>, <?= ... ?> or the short
// <? ... ?> form) from a string, without being fooled by a "?>" that sits
// inside a string literal or a comment.
//
// Sample input (illustrative): two PHP blocks separated by plain markup. Each
// block contains a "?>" that must NOT be taken as the end of the block.
$str = <<<'EOD'
<?php
$b = '?>';
$c = '';$d = '';$e = '';
$a = 1;?>
<p>plain markup</p>
<? echo "?>"; /* it's a trap: ?> */ ?>
EOD;

// The pattern is kept in a nowdoc so that backslashes and quotes reach PCRE
// untouched. It uses the x modifier, hence the free spacing and the inline
// comments. Only // and /* */ comments are recognised, not shell-style ones.
$pattern = <<<'EOD'
~
# subpattern definitions (never matched directly, only called by name)
(?(DEFINE)
    (?<sq> # single quoted string
        ' [^'\\]*+ (?s: \\. [^'\\]* )*+ '
    )
    (?<dq> # double quoted string
        " [^"\\]*+ (?s: \\. [^"\\]* )*+ "
    )
    (?<label> [a-zA-Z_][a-zA-Z0-9_]* )
    (?<heredoc> # heredoc string, label bare or double quoted
        # both branches capture the label so the backreference below is set
        <<< (?| ( \g<label> ) | " ( \g<label> ) " ) \R
        (?: .* \R )*?
        \g{-1} ;? \R
    )
    (?<nowdoc> # nowdoc string
        <<< '( \g<label> )' \R
        (?: .* \R )*?
        \g{-1} ;? \R
    )
    (?<string> \g<sq> | \g<dq> | \g<heredoc> | \g<nowdoc> )
    # single line comment: stops at the end of the line or just before a
    # closing tag, because a closing tag also ends this kind of comment
    (?<slc> // (?: [^\r\n?]++ | \? (?!>) )* )
    # multiline comment (the terminator is optional: unterminated at EOF)
    (?<mlc> /\* [^*]*+ (?: \* (?!/) [^*]* )*+ (?:\*/)? )
    (?<comment> \g<slc> | \g<mlc> )
)
# main pattern
<\? (?: php\b | = )?
(?<content>
    # the slash and the less-than sign are excluded from the plain runs so
    # that the comment and heredoc alternatives below get a chance to match
    [^"'?/<]*+
    (?:
        \g<string> [^"'?/<]*
      |
        \g<comment> [^"'?/<]*
      |
        [/<] [^"'?/<]*        # lone slash or less-than sign
      |
        \? (?!>) [^"'?/<]*    # question mark that does not close the block
    )*+
)
(?: \?> )?
~x
EOD;

// preg_match_all() returns 0 on no match and false on error; nothing is
// printed in either case.
if ( preg_match_all($pattern, $str, $matches) ) {
    print_r($matches['content']);
}