现象
Illegal mix of collations (utf8_general_ci,IMPLICIT) and (utf8mb4_general_ci,COERCIBLE) for operation ‘like’ (errno 1267) (sqlstate HY000) during query
原因
在mysql中设置utf8-general-ci时是无法插入4字节utf-8字符的,如果mysql中设置了utf8-general-ci在插入4字节字符时则会出问题,
解决方案
为了避免此类问题, 可以提前对字符串做判断, 判断方法的代码片段如下:
package com.jd.market.sc.utils;
import org.junit.Assert;
import org.junit.Test;
import java.nio.charset.Charset;
/**
* 处理由于mysql配置字符编码utf8 general ci, 而应用会接受utf8-mb4字节的情况
* Created by zhaoyukai on 2017/9/28.
*/
public class UTF8MB4 {
private final static Charset UTF8 = Charset.forName("UTF-8");
/**
* 判断字符串中是否存在4字节字符
*
* @param input 输入字符串
* @return 包含4字节返回true, 否则为false
*/
public static boolean containsMb4Char(String input) {
if (input == null) {
return false;
}
byte[] bytes = input.getBytes(UTF8);
for (int i = 0; i < bytes.length; i++) {
byte b = bytes[i];
//four bytes
if ((b & 0XF0) == 0XF0) {
return true;
} else if ((b & 0XE0) == 0XE0) {
//three bytes
//forward 2 byte
i += 2;
} else if ((b & 0XC0) == 0XC0) {
i += 1;
}
}
return false;
}
/**
* 替换可能存在的utf8 4字节字符
*
* @param input 输入字符串
* @param replacement 替换为的字符串
* @return 替换后的utf8字符串
*/
public static String replaceMb4Char(String input, String replacement) {
if (input == null) {//为空直接返回
throw new IllegalArgumentException("input can not be null when replaceMb4Char");
}
if(!containsMb4Char(input)){//不包含四字节的utf8mb4字符,直接返回原值
return input;
}
//包含四字节的utf8mb4字符
StringBuilder sb = new StringBuilder(input.length());
byte[] bytes = input.getBytes(UTF8);
char[] chars = input.toCharArray();
int charIdx = 0;
for (int i = 0; i < bytes.length; i++) {
byte b = bytes[i];
//four bytes
if ((b & 0XF0) == 0XF0) {
sb.append(replacement);
//utf-8四字节字符unicode后变为2个字符, 故字符下标多加1
charIdx += 2;
i += 3;
continue;
} else if ((b & 0XE0) == 0XE0) {
//three bytes
//forward 2 byte
i += 2;
} else if ((b & 0XC0) == 0XC0) {
i += 1;
}
sb.append(chars[charIdx]);
charIdx++;
}
return sb.toString();
}
@Test
public void testReplacement() {
String input = "A啊中\uD83D\uDE00\uD83D\uDC7D\uD83D\uDC94哈哈哈";
String output = UTF8MB4.replaceMb4Char(input, "");
String expect = "A啊中哈哈哈";
Assert.assertEquals(expect, output);
}
@Test
public void testContainsMb4() {
testFalse("呵呵呵");
testFalse("AAAA");
testFalse(",,,");
testFalse("中国。。,&………………");
testFalse("我们mmm他们..你们abcdddd牛m");
testTrue("中山大?");
testTrue("\uD840\uDC87");
testTrue("?");
testTrue("【好玩吧、吧?秒审核不墨迹(好玩吧)赏2.3元】");
}
@Test
public void testReplacePerf() {
long startMm = System.currentTimeMillis();
int times = 10000000;
while (times > 0) {
UTF8MB4.replaceMb4Char("朝阳区和平街胜古东里1号楼4单元60\uD83D\uDC7D\uD83D\uDC941", "");
times--;
}
long end = System.currentTimeMillis();
long used = end - startMm;
System.out.println(used);
}
@Test
public void testContainsPerf() {
long startMm = System.currentTimeMillis();
int times = 10000000;
while (times > 0) {
UTF8MB4.containsMb4Char("朝阳区和平街胜古东里1号楼4单元60\uD83D\uDC7D\uD83D\uDC941");
times--;
}
long end = System.currentTimeMillis();
long used = end - startMm;
System.out.println(used);
}
void testTrue(String input) {
boolean contains = UTF8MB4.containsMb4Char(input);
Assert.assertTrue(contains);
}
void testFalse(String input) {
boolean contains = UTF8MB4.containsMb4Char(input);
Assert.assertFalse(contains);
}
@Test
public void testMb4() {
String chs = "A啊中\uD83D\uDE00\uD83D\uDC7D\uD83D\uDC94";
// byte[] bytes = chs.getBytes(UTF8);
byte[] masks = {
(byte) 0X00,
(byte) 0X80,
(byte) 0XC0,
(byte) 0XE0,
};
char c = 'ﭾ';
int count = 1;
System.out.println(String.format("%02X", (int) c));
while (count < 110000) {
char nc = (char) ((int) c + count);
System.out.print(nc);
byte[] bytes = String.valueOf(nc).getBytes(UTF8);
System.out.print(String.format("%02X", bytes[0]));
count++;
if (count % 50 == 0) {
System.out.println();
}
}
}
}
转载自:http://outofmemory.cn/code-snippet/82878/java-utf-8-codec-utf8-mb4