最近在用富文本编辑器,查了好多防XSS攻击的代码,都感觉不怎么好用。首先,这些方法都是过滤非法字符或者字符串,而标签字符串千变万化,难以过滤全面;其次,过滤后的代码甚至会丧失正常功能。因此我考虑只取我们需要的部分。
摒弃了过滤法,我考虑使用标签分析法。
参考了百度UEditor前端的过滤方法,它将允许标签的tag和属性列了出来并作以保留(白名单)。因此我也考虑使用白名单法:
// Whitelist: tag name => attribute names that may be kept on that tag.
// Any tag that is not a key here is dropped; any attribute not listed for its
// tag is stripped. Assigned to $this->allowParams because that is the name the
// traversal code reads.
$this->allowParams=array(
    'a'=>array('target','href','title','class','style'),
    'abbr'=>array('title','class','style'),
    'address' =>array('class','style'),
    'area' =>array('shape','coords','href','alt'),
    'article' =>array(),
    'aside' =>array(),
    'audio' =>array('autoplay','controls','loop','preload','src','class','style'),
    'b' =>array('class','style'),
    'bdi' =>array('dir'),
    'bdo' =>array('dir'),
    'big' =>array(),
    'blockquote' =>array('cite','class','style'),
    'br' =>array(),
    'caption' =>array('class','style'),
    'center' =>array(),
    'cite' =>array(),
    'code' =>array('class','style'),
    'col' =>array('align','valign','span','width','class','style'),
    'colgroup' =>array('align','valign','span','width','class','style'),
    'dd' =>array('class','style'),
    'del' =>array('datetime'),
    'details' =>array('open'),
    'div' =>array('class','style'),
    'dl' =>array('class','style'),
    'dt' =>array('class','style'),
    'em' =>array('class','style'),
    'font' =>array('color','size','face'),
    'footer' =>array(),
    'h1' =>array('class','style'),
    'h2' =>array('class','style'),
    'h3' =>array('class','style'),
    'h4' =>array('class','style'),
    'h5' =>array('class','style'),
    'h6' =>array('class','style'),
    'header' =>array(),
    'hr' =>array(),
    'i' =>array('class','style'),
    // 'style' is listed for img as well, matching the complete class listing.
    'img' =>array('src','alt','title','width','height','id','_src','loadingclass','class','data-latex','style'),
    'ins' =>array('datetime'),
    'li' =>array('class','style'),
    'mark' =>array(),
    'nav' =>array(),
    'ol' =>array('class','style'),
    'p' =>array('class','style'),
    'pre' =>array('class','style'),
    's' =>array(),
    'section' =>array(),
    'small' =>array(),
    'span' =>array('class','style'),
    'sub' =>array('class','style'),
    'sup' =>array('class','style'),
    'strong' =>array('class','style'),
    'table' =>array('width','border','align','valign','class','style'),
    'tbody' =>array('align','valign','class','style'),
    'td' =>array('width','rowspan','colspan','align','valign','class','style'),
    'tfoot' =>array('align','valign','class','style'),
    'th' =>array('width','rowspan','colspan','align','valign','class','style'),
    'thead' =>array('align','valign','class','style'),
    'tr' =>array('rowspan','align','valign','class','style'),
    'tt' =>array(),
    'u' =>array(),
    'text' =>array(), // simple_html_dom represents text nodes with the tag name "text"
    'ul' =>array('class','style')
);
白名单有了之后就要开始处理了。既然要处理html就要有处理html的工具。这里我选用simple_html_dom(百度可以查到,这里就不发链接了)
下面分析内容的dom:
// Wrap the submitted fragment in one <div> so the parsed tree has a single
// element directly under simple_html_dom's root node.
$uedata="<div>$html</div>";
// Parse the wrapped markup. Parsing the bare $html here would make the
// wrapper above dead code.
$dom=str_get_html($uedata);
这里解析部分就完成了。由于simple_html_dom解析的是树形结构,所以遍历时需要用图的遍历方式,这里我选用了dfs遍历
// Sanitize the tree depth-first, starting from the elements directly under the root.
$this->dfs($dom->root->children);
// Serialize the cleaned tree back into an HTML string.
$html=$dom->outertext;
// Release the memory held by the parser.
$dom->clear();
这里遍历处理很简单了,使用dfs递归就可以了。逐个判断tag和params,合格的留下,不合格的删掉。这里我的代码是这样的:
/**
 * Recursively sanitize a list of simple_html_dom nodes in place.
 *
 * - Text nodes are HTML-escaped without re-encoding entities that are
 *   already present (so "&nbsp;" stays "&nbsp;").
 * - Nodes whose tag is not a key of $this->allowParams are removed.
 * - On allowed tags, an attribute is stripped when it is not whitelisted for
 *   that tag, when its value contains "script" or "&#x", when it is a style
 *   attribute containing "expression", or when it is an <a href> that does
 *   not start with http:// or https://.
 *
 * @param array $doms Nodes to process (each exposes tag, attr, nodes,
 *                    innertext, outertext and removeAttribute()).
 * @return void
 */
public function dfs($doms){
    foreach ($doms as $domitem) {
        $tag = $domitem->tag;

        if ($tag === 'text') {
            // ENT_COMPAT keeps apostrophes literal; double_encode=false
            // leaves existing entities such as &amp; untouched.
            $domitem->innertext = htmlspecialchars($domitem->innertext, ENT_COMPAT, 'UTF-8', false);
            continue;
        }

        // isset() is a direct key lookup; no need to rebuild array_keys() per node.
        if (!isset($this->allowParams[$tag])) {
            $domitem->outertext = "";
            continue;
        }

        $allowed = $this->allowParams[$tag];
        foreach ($domitem->attr as $key => $value) {
            $name = (string)$key;
            // Valueless attributes may come through as non-strings; normalise first.
            $d = strtolower((string)$value);
            $remove = !in_array($name, $allowed, true)
                || strpos($d, 'script') !== false
                || strpos($d, '&#x') !== false
                || ($name === 'style' && strpos($d, 'expression') !== false)
                || ($tag === 'a' && $name === 'href' && !preg_match('#^https?://#', $d));
            if ($remove) {
                $domitem->removeAttribute($key);
            }
        }

        // Descend whenever there are child nodes of any kind, so text inside
        // leaf elements (no element children) is escaped as well.
        if (!empty($domitem->nodes)) {
            $this->dfs($domitem->nodes);
        }
    }
}
(感谢 @轻轻的烟雾 的提醒,现已更正新增了a标签的href验证)
到此整个处理就结束了。整合后放到ThinkPHP扩展类,代码如下:
<?php
namespace Org\Util;
require 'simple_html_dom.php';
/**
 * Whitelist-based sanitizer for rich-text (UEditor-style) HTML.
 *
 * The submitted markup is parsed with simple_html_dom; every tag and
 * attribute that is not explicitly whitelisted is removed, and text nodes
 * are HTML-escaped.
 */
class UEditorXSSRejector{
    /**
     * Whitelist: tag name => attribute names that may be kept on that tag.
     *
     * Initialised with the property so dfs() is usable even when it is
     * called without going through parse(). "video" is not whitelisted.
     *
     * @var array
     */
    private $allowParams = array(
        'a'=>array('target','href','title','class','style'),
        'abbr'=>array('title','class','style'),
        'address' =>array('class','style'),
        'area' =>array('shape','coords','href','alt'),
        'article' =>array(),
        'aside' =>array(),
        'audio' =>array('autoplay','controls','loop','preload','src','class','style'),
        'b' =>array('class','style'),
        'bdi' =>array('dir'),
        'bdo' =>array('dir'),
        'big' =>array(),
        'blockquote' =>array('cite','class','style'),
        'br' =>array(),
        'caption' =>array('class','style'),
        'center' =>array(),
        'cite' =>array(),
        'code' =>array('class','style'),
        'col' =>array('align','valign','span','width','class','style'),
        'colgroup' =>array('align','valign','span','width','class','style'),
        'dd' =>array('class','style'),
        'del' =>array('datetime'),
        'details' =>array('open'),
        'div' =>array('class','style'),
        'dl' =>array('class','style'),
        'dt' =>array('class','style'),
        'em' =>array('class','style'),
        'font' =>array('color','size','face'),
        'footer' =>array(),
        'h1' =>array('class','style'),
        'h2' =>array('class','style'),
        'h3' =>array('class','style'),
        'h4' =>array('class','style'),
        'h5' =>array('class','style'),
        'h6' =>array('class','style'),
        'header' =>array(),
        'hr' =>array(),
        'i' =>array('class','style'),
        'img' =>array('src','alt','title','width','height','id','_src','loadingclass','class','data-latex','style'),
        'ins' =>array('datetime'),
        'li' =>array('class','style'),
        'mark' =>array(),
        'nav' =>array(),
        'ol' =>array('class','style'),
        'p' =>array('class','style'),
        'pre' =>array('class','style'),
        's' =>array(),
        'section' =>array(),
        'small' =>array(),
        'span' =>array('class','style'),
        'sub' =>array('class','style'),
        'sup' =>array('class','style'),
        'strong' =>array('class','style'),
        'table' =>array('width','border','align','valign','class','style'),
        'tbody' =>array('align','valign','class','style'),
        'td' =>array('width','rowspan','colspan','align','valign','class','style'),
        'tfoot' =>array('align','valign','class','style'),
        'th' =>array('width','rowspan','colspan','align','valign','class','style'),
        'thead' =>array('align','valign','class','style'),
        'tr' =>array('rowspan','align','valign','class','style'),
        'tt' =>array(),
        'u' =>array(),
        'text' =>array(), // simple_html_dom represents text nodes with the tag name "text"
        'ul' =>array('class','style')
    );

    /**
     * Sanitize a fragment of user-submitted HTML.
     *
     * Steps: remove every "&#" sequence, collapse runs of "<" down to "<<",
     * wrap the result in a <div>, parse it, sanitize the tree with dfs() and
     * serialize it again.
     * NOTE(review): the serialized output presumably still contains the
     * wrapping <div> - confirm against callers before changing that.
     *
     * @param string $uedata Raw HTML from the editor.
     * @return string Sanitized HTML; if the markup cannot be parsed, the raw
     *                input is returned fully HTML-escaped instead.
     */
    public function parse($uedata){
        $raw = (string)$uedata;

        $uedata = str_replace("&#", "", $raw);
        $uedata = preg_replace("/<<+/", "<<", $uedata);
        $uedata = "<div>$uedata</div>";

        $dom = str_get_html($uedata);
        if (!is_object($dom)) {
            // str_get_html() can return false (for example when the input is
            // over the parser's size limit). Fail closed with escaped text
            // rather than a fatal error on $dom->root.
            return htmlspecialchars($raw, ENT_QUOTES, 'UTF-8');
        }

        $this->dfs($dom->root->children);
        $html = $dom->outertext;
        $dom->clear(); // release parser memory
        return $html;
    }

    /**
     * Recursively sanitize a list of simple_html_dom nodes in place.
     *
     * - Text nodes are HTML-escaped without re-encoding entities that are
     *   already present (so "&nbsp;" stays "&nbsp;").
     * - Nodes whose tag is not whitelisted are removed.
     * - On allowed tags, an attribute is stripped when it is not whitelisted
     *   for that tag, when its value contains "script" or "&#x", when it is
     *   a style attribute containing "expression", or when it is an <a href>
     *   that does not start with http:// or https://.
     *
     * @param array $doms Nodes to process (each exposes tag, attr, nodes,
     *                    innertext, outertext and removeAttribute()).
     * @return void
     */
    public function dfs($doms){
        foreach ($doms as $domitem) {
            $tag = $domitem->tag;

            if ($tag === 'text') {
                // ENT_COMPAT keeps apostrophes literal, so no numeric entity
                // is produced that parse() would mangle on a later pass;
                // double_encode=false leaves existing entities untouched.
                $domitem->innertext = htmlspecialchars($domitem->innertext, ENT_COMPAT, 'UTF-8', false);
                continue;
            }

            // isset() is a direct key lookup; no need to rebuild array_keys() per node.
            if (!isset($this->allowParams[$tag])) {
                $domitem->outertext = "";
                continue;
            }

            $allowed = $this->allowParams[$tag];
            foreach ($domitem->attr as $key => $value) {
                $name = (string)$key;
                // Valueless attributes may come through as non-strings; normalise first.
                $d = strtolower((string)$value);
                $remove = !in_array($name, $allowed, true)
                    || strpos($d, 'script') !== false
                    || strpos($d, '&#x') !== false
                    || ($name === 'style' && strpos($d, 'expression') !== false)
                    || ($tag === 'a' && $name === 'href' && !preg_match('#^https?://#', $d));
                if ($remove) {
                    $domitem->removeAttribute($key);
                }
            }

            // Descend whenever there are child nodes of any kind, so text
            // inside leaf elements (no element children) is escaped as well.
            if (!empty($domitem->nodes)) {
                $this->dfs($domitem->nodes);
            }
        }
    }
}
?>
将simple_html_dom.php重命名为simple_html_dom.class.php放到\Org\Util下,增加命名空间声明(同时把上面代码里require的文件名相应改为simple_html_dom.class.php)。然后将上述代码保存为UEditorXSSRejector.class.php放在同目录下。然后在function里调用:
use Org\Util;
/**
 * Sanitize rich-text HTML by delegating to UEditorXSSRejector::parse().
 *
 * @param string $val Raw HTML to clean.
 * @return string Whatever UEditorXSSRejector::parse() returns for $val.
 */
function remove_xss($val) {
    $rejector = new \Org\Util\UEditorXSSRejector();
    $cleaned = $rejector->parse($val);
    return $cleaned;
}
大功告成。拿wangEditor提交到Controller试了一下不管是加粗倾斜有序无序列表链接图片等等都感觉没什么问题。