mysql 藏文字典_mysql无法压缩存储表情

最新推荐文章于 2023-11-02 20:32:22 发布

萨缪尔

最新推荐文章于 2023-11-02 20:32:22 发布

阅读量280

点赞数

文章标签： mysql 藏文字典

本文链接：https://blog.csdn.net/weixin_33662430/article/details/113200326

版权

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/***

* @ClassName: EmojiUtils

* @Description:过滤emoji表情

*@author: Simon

* @date: 2017年9月22日上午11:11:00*/

public class EmojiUtils {

    /**
     * Regular-expression character class listing every UTF-16 code unit that is
     * accepted unchanged by {@link #emojiChange(String)}.
     *
     * <p>The class is a whitelist of Basic Multilingual Plane ranges. The two
     * surrogate zones (D800-DBFF and DC00-DFFF) are intentionally left out, so
     * both halves of any supplementary character (which is how most emoji are
     * encoded in a Java string) fall outside the whitelist. N'Ko (07C0-07FF) is
     * also absent; it was disabled in the original table.
     *
     * <p>The block labels in the trailing comments are carried over from the
     * original author's table and have not been checked against the current
     * Unicode block names. A few ranges are listed twice; that is harmless
     * inside a character class.
     */
    public final static String unicodeReg = "["
            + "\u4E00-\u9FBF" // CJK Unified Ideographs
            + "\u4DC0-\u4DFF" // Yijing Hexagrams Symbols
            + "\u0000-\u007F" // C0 Control and Basic Latin
            + "\u0080-\u00FF" // C1 Control and Latin 1 Supplement
            + "\u0100-\u017F" // Latin Extended-A
            + "\u0180-\u024F" // Latin Extended-B
            + "\u0250-\u02AF" // IPA Extensions
            + "\u02B0-\u02FF" // Spacing Modifiers
            + "\u0300-\u036F" // Combining Diacritics Marks
            + "\u0370-\u03FF" // Greek and Coptic
            + "\u0400-\u04FF" // Cyrillic
            + "\u0500-\u052F" // Cyrillic Supplement
            + "\u0530-\u058F" // Armenian
            + "\u0590-\u05FF" // Hebrew
            + "\u0600-\u06FF" // Arabic
            + "\u0700-\u074F" // Syriac
            + "\u0750-\u077F" // Arabic Supplement
            + "\u0780-\u07BF" // Thaana
            + "\u0800-\u085F" // Avestan and Pahlavi
            + "\u0860-\u087F" // Mandaic
            + "\u0880-\u08AF" // Samaritan
            + "\u0900-\u097F" // Devanagari
            + "\u0980-\u09FF" // Bengali
            + "\u0A00-\u0A7F" // Gurmukhi
            + "\u0A80-\u0AFF" // Gujarati
            + "\u0B00-\u0B7F" // Oriya
            + "\u0B80-\u0BFF" // Tamil
            + "\u0C00-\u0C7F" // Telugu
            + "\u0C80-\u0CFF" // Kannada
            + "\u0D00-\u0D7F" // Malayalam
            + "\u0D80-\u0DFF" // Sinhala
            + "\u0E00-\u0E7F" // Thai
            + "\u0E80-\u0EFF" // Lao
            + "\u0F00-\u0FFF" // Tibetan
            + "\u1000-\u109F" // Myanmar
            + "\u10A0-\u10FF" // Georgian
            + "\u1100-\u11FF" // Hangul Jamo
            + "\u1200-\u137F" // Ethiopic
            + "\u1380-\u139F" // Ethiopic Supplement
            + "\u13A0-\u13FF" // Cherokee
            + "\u1400-\u167F" // Unified Canadian Aboriginal Syllabics
            + "\u1680-\u169F" // Ogham
            + "\u16A0-\u16FF" // Runic
            + "\u1700-\u171F" // Tagalog
            + "\u1720-\u173F" // Hanunoo
            + "\u1740-\u175F" // Buhid
            + "\u1760-\u177F" // Tagbanwa
            + "\u1780-\u17FF" // Khmer
            + "\u1800-\u18AF" // Mongolian
            + "\u18B0-\u18FF" // Cham
            + "\u1900-\u194F" // Limbu
            + "\u1950-\u197F" // Tai Le
            + "\u1980-\u19DF" // New Tai Lue
            + "\u19E0-\u19FF" // Khmer Symbols
            + "\u1A00-\u1A1F" // Buginese
            + "\u1A20-\u1A5F" // Batak
            + "\u1A80-\u1AEF" // Lanna
            + "\u1B00-\u1B7F" // Balinese
            + "\u1B80-\u1BB0" // Sundanese
            + "\u1BC0-\u1BFF" // Pahawh Hmong
            + "\u1C00-\u1C4F" // Lepcha
            + "\u1C50-\u1C7F" // Ol Chiki
            + "\u1C80-\u1CDF" // Meithei/Manipuri
            + "\u1D00-\u1D7F" // Phonetic Extensions
            + "\u1D80-\u1DBF" // Phonetic Extensions Supplement
            + "\u1DC0-\u1DFF" // Combining Diacritics Marks Supplement
            + "\u1E00-\u1EFF" // Latin Extended Additional
            + "\u1F00-\u1FFF" // Greek Extended
            + "\u2000-\u206F" // General Punctuation
            + "\u2070-\u209F" // Superscripts and Subscripts
            + "\u20A0-\u20CF" // Currency Symbols
            + "\u20D0-\u20FF" // Combining Diacritics Marks for Symbols
            + "\u2100-\u214F" // Letterlike Symbols
            + "\u2150-\u218F" // Number Form
            + "\u2190-\u21FF" // Arrows
            + "\u2200-\u22FF" // Mathematical Operator
            + "\u2300-\u23FF" // Miscellaneous Technical
            + "\u2400-\u243F" // Control Pictures
            + "\u2440-\u245F" // Optical Character Recognition
            + "\u2460-\u24FF" // Enclosed Alphanumerics
            + "\u2500-\u257F" // Box Drawing
            + "\u2580-\u259F" // Block Element
            + "\u25A0-\u25FF" // Geometric Shapes
            + "\u2600-\u26FF" // Miscellaneous Symbols
            + "\u2700-\u27BF" // Dingbats
            + "\u27C0-\u27EF" // Miscellaneous Mathematical Symbols-A
            + "\u27F0-\u27FF" // Supplemental Arrows-A
            + "\u2800-\u28FF" // Braille Patterns
            + "\u2900-\u297F" // Supplemental Arrows-B
            + "\u2980-\u29FF" // Miscellaneous Mathematical Symbols-B
            + "\u2A00-\u2AFF" // Supplemental Mathematical Operator
            + "\u2B00-\u2BFF" // Miscellaneous Symbols and Arrows
            + "\u2C00-\u2C5F" // Glagolitic
            + "\u2C60-\u2C7F" // Latin Extended-C
            + "\u2C80-\u2CFF" // Coptic
            + "\u2D00-\u2D2F" // Georgian Supplement
            + "\u2D30-\u2D7F" // Tifinagh
            + "\u2D80-\u2DDF" // Ethiopic Extended
            + "\u2E00-\u2E7F" // Supplemental Punctuation
            + "\u2E80-\u2EFF" // CJK Radicals Supplement
            + "\u2F00-\u2FDF" // Kangxi Radicals
            + "\u2FF0-\u2FFF" // Ideographic Description Characters
            + "\u3000-\u303F" // CJK Symbols and Punctuation
            + "\u3040-\u309F" // Hiragana
            + "\u30A0-\u30FF" // Katakana
            + "\u3100-\u312F" // Bopomofo
            + "\u3130-\u318F" // Hangul Compatibility Jamo
            + "\u3190-\u319F" // Kanbun
            + "\u31A0-\u31BF" // Bopomofo Extended
            + "\u31C0-\u31EF" // CJK Strokes
            + "\u31F0-\u31FF" // Katakana Phonetic Extensions
            + "\u3200-\u32FF" // Enclosed CJK Letters and Months
            + "\u3300-\u33FF" // CJK Compatibility
            + "\u3400-\u4DBF" // CJK Unified Ideographs Extension A
            + "\u4DC0-\u4DFF" // Yijing Hexagrams Symbols
            + "\u4E00-\u9FBF" // CJK Unified Ideographs
            + "\uA000-\uA48F" // Yi Syllables
            + "\uA490-\uA4CF" // Yi Radicals
            + "\uA500-\uA61F" // Vai
            + "\uA660-\uA6FF" // Unified Canadian Aboriginal Syllabics Supplement
            + "\uA700-\uA71F" // Modifier Tone Letters
            + "\uA720-\uA7FF" // Latin Extended-D
            + "\uA800-\uA82F" // Syloti Nagri
            + "\uA840-\uA87F" // Phags-pa
            + "\uA880-\uA8DF" // Saurashtra
            + "\uA900-\uA97F" // Javanese
            + "\uA980-\uA9DF" // Chakma
            + "\uAA00-\uAA3F" // Varang Kshiti
            + "\uAA40-\uAA6F" // Sorang Sompeng
            + "\uAA80-\uAADF" // Newari
            + "\uAB00-\uAB5F" // Viet Thai
            + "\uAB80-\uABA0" // Kayah Li
            + "\uAC00-\uD7AF" // Hangul Syllables
            + "\uE000-\uF8FF" // Private Use Zone
            + "\uF900-\uFAFF" // CJK Compatibility Ideographs
            + "\uFB00-\uFB4F" // Alphabetic Presentation Form
            + "\uFB50-\uFDFF" // Arabic Presentation Form-A
            + "\uFE00-\uFE0F" // Variation Selector
            + "\uFE10-\uFE1F" // Vertical Forms
            + "\uFE20-\uFE2F" // Combining Half Marks
            + "\uFE30-\uFE4F" // CJK Compatibility Forms
            + "\uFE50-\uFE6F" // Small Form Variants
            + "\uFE70-\uFEFF" // Arabic Presentation Form-B
            + "\uFF00-\uFFEF" // Halfwidth and Fullwidth Form
            + "\uFFF0-\uFFFF]"; // Specials

    /** Whitelist compiled once; a Pattern is immutable and safe to share. */
    private static final Pattern ALLOWED_CHAR = Pattern.compile(unicodeReg);

    /** Text written in place of every rejected UTF-16 code unit (U+25A1, white square). */
    private static final String PLACEHOLDER = "\u25A1";

    /** Upper-case hex digits used when escaping. */
    private static final char[] HEX_UPPER = "0123456789ABCDEF".toCharArray();

    /** Length of one escape: backslash, marker letter, four hex digits. */
    private static final int ESCAPE_LENGTH = 6;

    /**
     * Escapes every UTF-16 code unit of a string as a six-character sequence:
     * a backslash, an upper-case {@code U}, and four upper-case hex digits.
     *
     * <p>The marker letter is upper-case because the original implementation
     * upper-cased its entire result; that output format is kept so existing
     * callers see no change. {@link #revert(String)} accepts this form.
     *
     * @param str text to escape; {@code null} is treated as the empty string
     * @return the escaped text, never {@code null}
     */
    public static String convert(String str) {
        if (str == null || str.isEmpty()) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(str.length() * ESCAPE_LENGTH);
        for (int i = 0; i < str.length(); i++) {
            char unit = str.charAt(i);
            escaped.append('\\').append('U')
                    .append(HEX_UPPER[(unit >>> 12) & 0xF])
                    .append(HEX_UPPER[(unit >>> 8) & 0xF])
                    .append(HEX_UPPER[(unit >>> 4) & 0xF])
                    .append(HEX_UPPER[unit & 0xF]);
        }
        return escaped.toString();
    }

    /**
     * Decodes the escapes produced by {@link #convert(String)} back into text.
     *
     * <p>An escape is a backslash, the letter {@code u} in either case, and
     * exactly four hex digits in either case. Every well-formed escape is
     * replaced by the code unit it names, including one that ends the string.
     * All other characters, and malformed or truncated escapes, are copied
     * through unchanged, so input that mixes plain text with escapes is
     * handled.
     *
     * @param str text that may contain escapes; {@code null} is treated as the
     *            empty string
     * @return the decoded text, never {@code null}
     */
    public static String revert(String str) {
        if (str == null) {
            return "";
        }
        // Fast path: no escape marker at all, return the input untouched.
        if (str.indexOf("\\u") == -1 && str.indexOf("\\U") == -1) {
            return str;
        }
        StringBuilder decoded = new StringBuilder(str.length());
        int pos = 0;
        while (pos < str.length()) {
            int unit = decodeEscapeAt(str, pos);
            if (unit >= 0) {
                decoded.append((char) unit);
                pos += ESCAPE_LENGTH;
            } else {
                decoded.append(str.charAt(pos));
                pos++;
            }
        }
        return decoded.toString();
    }

    /**
     * Reads one escape starting at {@code pos}.
     *
     * @return the decoded code unit (0..0xFFFF), or -1 when no well-formed
     *         escape starts at {@code pos}
     */
    private static int decodeEscapeAt(String text, int pos) {
        if (pos + ESCAPE_LENGTH > text.length() || text.charAt(pos) != '\\') {
            return -1;
        }
        char marker = text.charAt(pos + 1);
        if (marker != 'u' && marker != 'U') {
            return -1;
        }
        int value = 0;
        for (int k = pos + 2; k < pos + ESCAPE_LENGTH; k++) {
            int digit = hexValue(text.charAt(k));
            if (digit < 0) {
                return -1;
            }
            value = (value << 4) | digit;
        }
        return value;
    }

    /** Returns the value of an ASCII hex digit in either case, or -1 if it is not one. */
    private static int hexValue(char ch) {
        if (ch >= '0' && ch <= '9') {
            return ch - '0';
        }
        if (ch >= 'a' && ch <= 'f') {
            return ch - 'a' + 10;
        }
        if (ch >= 'A' && ch <= 'F') {
            return ch - 'A' + 10;
        }
        return -1;
    }

    /**
     * Replaces every UTF-16 code unit that is not whitelisted by
     * {@link #unicodeReg} with a white-square placeholder.
     *
     * <p>The check is made per code unit, so a supplementary character stored
     * as a surrogate pair produces two placeholders and the result always has
     * the same length as the input.
     *
     * @param string text to filter; {@code null} yields the empty string
     * @return the filtered text, never {@code null}
     */
    public static String emojiChange(String string) {
        if (string == null) {
            return "";
        }
        StringBuilder filtered = new StringBuilder(string.length());
        for (int i = 0; i < string.length(); i++) {
            String unit = String.valueOf(string.charAt(i));
            if (ALLOWED_CHAR.matcher(unit).find()) {
                filtered.append(unit);
            } else {
                filtered.append(PLACEHOLDER);
            }
        }
        return filtered.toString();
    }
}

萨缪尔

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫