做项目的时候遇到emoji表情的问题,存储到mysql数据库时数据库会报错。
原因:UTF-8是变长编码,一个字符可能占一到四个字节。大多数Emoji表情占4个字节,而MySQL的utf8(即utf8mb3)编码每个字符最多只支持3个字节,所以数据插不进去。
是否报错跟数据库的sql_mode配置有关:严格模式(如STRICT_TRANS_TABLES)下插入会直接报错(Incorrect string value),非严格模式下只产生警告,数据会从emoji所在位置被截断。具体参考
http://blog.csdn.net/wulantian/article/details/8905573
http://blog.csdn.net/wulantian/article/details/8905573
解决方案一:数据库解决方法(将Mysql的编码从utf8转换成utf8mb4)。
参考:http://www.tuicool.com/articles/qeEFn2R
1. 修改my.ini
[client]
default-character-set = utf8mb4
[mysql]
default-character-set = utf8mb4
[mysqld]
character-set-client-handshake = FALSE
character-set-server = utf8mb4
collation-server = utf8mb4_unicode_ci
2. 修改您的数据库、表、字段
# 对每一个数据库:
ALTER DATABASE 这里数据库名字 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci;
# 对每一个表:
ALTER TABLE 这里是表名字 CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
# 对每一个字段(utf8mb4每个字符最多占4字节,而旧版InnoDB的索引键长度上限为767字节,所以需要建索引的VARCHAR字段长度最多为191):
ALTER TABLE 这里是表名字 CHANGE 字段名字 字段名字 VARCHAR(191) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
3.重启mysql
4.通过下面命令查询是否设置成功
SHOW VARIABLES WHERE Variable_name LIKE 'character\_set\_%' OR Variable_name LIKE 'collation%';
# 运行上面代码显示下面结果
# +--------------------------+--------------------+
# | Variable_name | Value |
# +--------------------------+--------------------+
# | character_set_client | utf8mb4 |
# | character_set_connection | utf8mb4 |
# | character_set_database | utf8mb4 |
# | character_set_filesystem | binary |
# | character_set_results | utf8mb4 |
# | character_set_server | utf8mb4 |
# | character_set_system | utf8 |
# | collation_connection | utf8mb4_unicode_ci |
# | collation_database | utf8mb4_unicode_ci |
# | collation_server | utf8mb4_unicode_ci |
# +--------------------------+--------------------+
解决方案二:php代码解决方法(匹配4字节的字符):
方法一:
问题:1、会误匹配非emoji字符,例如带圈数字①(U+2460,落在正则的 \x{2460}-\x{24FF} 区间内)
2、不能匹配所有的emoji表情(例如U+1F900-U+1F9FF区间的表情不在正则范围内)
/**
 * Look for the first emoji-like sequence in a UTF-8 string.
 *
 * The pattern accepts a keycap sequence (a digit or '#' followed by U+20E3),
 * a fixed list of individual symbols, several BMP symbol ranges and
 * U+1F000-U+1F6FF, each optionally followed by one code point in
 * U+FE00-U+FEFF. The BMP ranges also contain non-emoji symbols, so this
 * over-matches some characters and misses emoji outside the listed ranges.
 *
 * @param string $string UTF-8 text to scan.
 * @return array The match array filled in by preg_match(): index 0 holds the
 *               matched text, index 1 is present only for a keycap match.
 *               Empty when nothing matched.
 */
function emoji_match($string){
    $emojiPattern = '/([0-9#][\x{20E3}])|[\x{00ae}\x{00a9}\x{203C}\x{2047}\x{2048}\x{2049}\x{3030}\x{303D}\x{2139}\x{2122}\x{3297}\x{3299}][\x{FE00}-\x{FEFF}]?|[\x{2190}-\x{21FF}][\x{FE00}-\x{FEFF}]?|[\x{2300}-\x{23FF}][\x{FE00}-\x{FEFF}]?|[\x{2460}-\x{24FF}][\x{FE00}-\x{FEFF}]?|[\x{25A0}-\x{25FF}][\x{FE00}-\x{FEFF}]?|[\x{2600}-\x{27BF}][\x{FE00}-\x{FEFF}]?|[\x{2900}-\x{297F}][\x{FE00}-\x{FEFF}]?|[\x{2B00}-\x{2BF0}][\x{FE00}-\x{FEFF}]?|[\x{1F000}-\x{1F6FF}][\x{FE00}-\x{FEFF}]?/u';

    // Only the first occurrence is reported; preg_match() stops after one hit.
    preg_match($emojiPattern, $string, $firstHit);

    return $firstHit;
}
方法二:移除emoji表情的方法
问题:1、此方法会把所有不能用ISO-8859-1表示的字符(包括中文)连同emoji一起移除
/**
 * Strip emoji by round-tripping the text through ISO-8859-1.
 *
 * Any character that ISO-8859-1 cannot represent is replaced by mbstring's
 * substitution character ("?" with the default mbstring settings) during the
 * first conversion and is then deleted. This removes emoji, but equally
 * removes every other non-Latin-1 character such as Chinese text.
 *
 * Note: the sequence "{%}" is used as a temporary stand-in for real question
 * marks, so a literal "{%}" in the input comes back as "?".
 *
 * @param string $string UTF-8 input text.
 * @return string The text without non-Latin-1 characters, trimmed of
 *                surrounding whitespace.
 */
function removeEmojis( $string ) {
    // Hide genuine question marks so they survive the clean-up below.
    $masked = str_replace( "?", "{%}", $string );

    // UTF-8 -> Latin-1 -> UTF-8: unrepresentable characters turn into "?".
    $latin1 = mb_convert_encoding( $masked, "ISO-8859-1", "UTF-8" );
    $roundTripped = mb_convert_encoding( $latin1, "UTF-8", "ISO-8859-1" );

    // Delete the substitution marks. The single-element replacement list
    // means every search string is replaced with an empty string.
    $cleaned = str_replace( [ "?", "? ", " ?" ], [ "" ], $roundTripped );

    // Bring back the question marks that were in the original text.
    return trim( str_replace( "{%}", "?", $cleaned ) );
}
方法三:
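问题:1、不能匹配所有的emoji表情:只能检测到json_encode后被转义成\ud开头的字符(即4字节字符的代理对),像☺(U+263A)这类3字节的emoji检测不到
2、函数只返回preg_match的匹配结果,并不会真正过滤字符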
/**
 * Detect whether a value contains a character outside the Basic Multilingual
 * Plane (which is where most emoji live).
 *
 * The value is JSON-encoded first, so arrays and nested structures can be
 * checked in one call. json_encode() writes every non-BMP character as a
 * UTF-16 surrogate pair ("\ud83d\ude00"), and the regex looks for the high
 * surrogate of such a pair.
 *
 * Despite its name the function only detects; it does not remove anything.
 * Emoji inside the BMP (for example U+263A) are not detected.
 *
 * @param mixed $params Any value json_encode() accepts.
 * @return int|false 1 if a surrogate escape was found, 0 if not (or if the
 *                   value could not be JSON-encoded), false on a PCRE error.
 */
function filter_emoji($params){
    $json = json_encode($params);
    if ($json === false) {
        // Not encodable (e.g. invalid UTF-8): nothing to inspect.
        return 0;
    }

    // - No "e" modifier: it is not valid for preg_match() and makes the call
    //   fail with "Unknown modifier" on PHP 7+.
    // - Only \ud800-\udbff (high surrogates) are matched; \ud000-\ud7ff are
    //   ordinary characters such as Hangul syllables.
    // - The lookbehind plus the run of doubled backslashes ensure the "\u"
    //   is a real JSON escape and not an escaped literal backslash followed
    //   by the text "ud83d".
    return preg_match('#(?<!\\\\)(?:\\\\\\\\)*\\\\ud[89ab][0-9a-f]{2}#i', $json);
}
方法四:过滤大于三字节的utf-8字符
/**
 * Remove every UTF-8 sequence longer than three bytes from a string.
 *
 * One-, two- and three-byte sequences are copied through unchanged. Sequences
 * introduced by a four-byte (or the obsolete five-/six-byte) lead byte are
 * dropped, which makes the result storable in a MySQL "utf8" (utf8mb3) column.
 *
 * Malformed input is handled without reading past the end of the string:
 * stray continuation bytes are dropped, and an incomplete sequence loses only
 * its lead byte and the continuation bytes that actually follow it, so valid
 * text after a broken sequence is preserved.
 *
 * @param string $str Input bytes, expected to be UTF-8.
 * @return string The input without sequences longer than three bytes.
 */
function filterUtf8($str)
{
    /*
     * UTF-8 encoding table:
     *   Unicode range        | UTF-8 byte layout
     *   U+0000  - U+007F     | 0xxxxxxx
     *   U+0080  - U+07FF     | 110xxxxx 10xxxxxx
     *   U+0800  - U+FFFF     | 1110xxxx 10xxxxxx 10xxxxxx
     *   U+10000 - U+10FFFF   | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  (dropped)
     */
    $str = (string) $str;
    $length = strlen($str);
    $re = '';
    $i = 0;

    while ($i < $length) {
        $lead = ord($str[$i]);

        // Plain ASCII byte: keep as is.
        if ($lead < 0x80) {
            $re .= $str[$i];
            $i++;
            continue;
        }

        // 10xxxxxx without a preceding lead byte: stray continuation, drop it.
        if ($lead < 0xC0) {
            $i++;
            continue;
        }

        // Sequence length announced by the lead byte.
        if ($lead >= 0xFC) {
            $size = 6;
        } elseif ($lead >= 0xF8) {
            $size = 5;
        } elseif ($lead >= 0xF0) {
            $size = 4;
        } elseif ($lead >= 0xE0) {
            $size = 3;
        } else {
            $size = 2;
        }

        // Count the continuation bytes (10xxxxxx) that really follow, never
        // looking beyond the end of the input.
        $tail = 0;
        while (
            $tail < $size - 1
            && $i + 1 + $tail < $length
            && (ord($str[$i + 1 + $tail]) & 0xC0) === 0x80
        ) {
            $tail++;
        }

        // Keep only complete sequences of at most three bytes.
        if ($size <= 3 && $tail === $size - 1) {
            $re .= substr($str, $i, $size);
        }

        // Skip the lead byte plus whatever continuation bytes belonged to it.
        $i += 1 + $tail;
    }

    return $re;
}