Java 实现 URL Decode
背景:
Elasticsearch 的自定义脚本语言 Painless 支持 Java 的部分 API,但 6.x 版本不支持 java.net.URLDecoder(7.0 已经支持),而分析用户搜索词时需要对 URL 进行解码(URL Decode),
因此需要自己实现。
package com.demo;
import org.junit.Test;
/**
 * Hand-rolled URL decoder (application/x-www-form-urlencoded, UTF-8).
 *
 * <p>The decoding is done manually - percent escapes are turned into bytes and
 * the bytes are then assembled into Unicode code points - instead of going
 * through {@code new String(byte[], charset)}, so that the same logic can be
 * ported to scripting environments that do not expose that constructor.
 *
 * <p>References:
 * <ul>
 *   <li>java.net.URLDecoder.decode()</li>
 *   <li>https://zh.wikipedia.org/wiki/UTF-8 - UTF-8</li>
 *   <li>https://zh.wikipedia.org/wiki/UTF-16 - UTF-16 (surrogate pairs)</li>
 *   <li>https://blog.csdn.net/hezh1994/article/details/78899683 - understanding Unicode encoding</li>
 *   <li>https://blog.csdn.net/sinat_38816924/article/details/78438070</li>
 *   <li>https://blog.csdn.net/iteye_13222/article/details/82636048 - how a Chinese Java string
 *       is encoded into a UTF-8 byte array</li>
 *   <li>https://blog.csdn.net/zx1749623383/article/details/79540748 - encoding and decoding
 *       Unicode in Java</li>
 *   <li>https://blog.csdn.net/e19901004/article/details/103880863 - detecting 4-byte UTF-8
 *       characters in a string</li>
 *   <li>https://www.cnblogs.com/weizhxa/p/12010890.html - special characters (including emoji)
 *       and how UTF-8 encoding/decoding works</li>
 *   <li>https://blog.csdn.net/left_la/article/details/36188181 - Unicode in detail
 *       (UCS-2, UCS-4, UTF-8, UTF-16, UTF-32)</li>
 *   <li>http://www.fmddlmyy.cn/text6.html - notes on Unicode; UCS, UTF, BMP, BOM explained</li>
 * </ul>
 *
 * <p>Tools:
 * <ul>
 *   <li>https://tool.lu/hexconvert/</li>
 *   <li>http://tool.chinaz.com/tools/urlencode.aspx (annotated "bug" by the original author)</li>
 *   <li>https://design215.com/toolbox/utf8-4byte-characters.php</li>
 * </ul>
 */
public class UrlDecode {

    /** Emitted in place of a percent-encoded byte sequence that is not well-formed UTF-8 (U+FFFD). */
    private static final char REPLACEMENT_CHAR = '\uFFFD';

    /**
     * Demo entry point: decodes a sample request URI and prints the result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Same result as java.net.URLDecoder.decode(str, "UTF-8") for this input.
        String str = "/controller/action?&wd=%F0%9F%8D%80&s=%E9%9D%92%E5%B1%B1%E6%9C%AC%E4%B8%8D%E8%80%81%EF%BC%8C%E4%B8%BA%E9%9B%AA%E7%99%BD%E5%A4%B4%EF%BC%9B%E7%BB%BF%E6%B0%B4%E6%9C%AC%E6%97%A0%E5%BF%A7%EF%BC%8C%E5%9B%A0%E9%A3%8E%E7%9A%B1%E9%9D%A2&page=1&page_size=30";
        String decodedStr = urldecode(str);
        // Prints "/controller/action?&wd=<U+1F340>&s=<21 decoded CJK characters>&page=1&page_size=30":
        // wd is the four-byte sequence F0 9F 8D 80, s is a run of three-byte sequences.
        System.out.println(decodedStr);
    }

    /**
     * Decodes a URL-encoded string: {@code '+'} becomes a space and every run of
     * {@code %XX} escapes is interpreted as UTF-8 bytes and converted to characters.
     * All other characters are copied unchanged.
     *
     * <p>Byte sequences that are not well-formed UTF-8 (stray continuation bytes,
     * truncated sequences, overlong forms, encoded surrogates, values above
     * U+10FFFF) are replaced by U+FFFD instead of being dropped or mis-decoded.
     *
     * @param s the encoded text; must not be {@code null}
     * @return the decoded text, or {@code s} itself when it contains neither
     *         {@code '+'} nor {@code '%'}
     * @throws NumberFormatException if a {@code '%'} is followed by a character
     *         that is not a hexadecimal digit
     * @throws IllegalArgumentException if a {@code '%'} is followed by fewer than
     *         two characters (incomplete trailing escape)
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static String urldecode(String s) {
        int numChars = s.length();
        StringBuilder sb = new StringBuilder(numChars);
        boolean needToChange = false;
        int[] bytes = null; // allocated lazily, reused for every percent run
        int i = 0;

        while (i < numChars) {
            // Compare chars directly: narrowing to byte would make e.g. U+012B look like '+'.
            char c = s.charAt(i);
            if (c == '+') {
                sb.append(' ');
                i++;
                needToChange = true;
            } else if (c == '%') {
                if (bytes == null) {
                    // Each escape takes three chars, so this bounds every later run as well.
                    bytes = new int[(numChars - i) / 3];
                }
                int count = 0;
                // Gather the whole run of consecutive escapes before decoding, because one
                // character may span up to four escaped bytes.
                while (i < numChars && s.charAt(i) == '%') {
                    if (i + 2 >= numChars) {
                        throw new IllegalArgumentException(
                                "Incomplete trailing escape (%) pattern at index " + i);
                    }
                    bytes[count++] = (hexValue(s, i + 1) << 4) | hexValue(s, i + 2);
                    i += 3;
                }
                appendUtf8(sb, bytes, count);
                needToChange = true;
            } else {
                sb.append(c);
                i++;
            }
        }
        return needToChange ? sb.toString() : s;
    }

    /** Returns the value (0-15) of the ASCII hex digit at {@code index}, or throws NumberFormatException. */
    private static int hexValue(String s, int index) {
        char ch = s.charAt(index);
        if (ch >= '0' && ch <= '9') {
            return ch - '0';
        }
        if (ch >= 'a' && ch <= 'f') {
            return ch - 'a' + 10;
        }
        if (ch >= 'A' && ch <= 'F') {
            return ch - 'A' + 10;
        }
        throw new NumberFormatException(
                "Illegal hex character '" + ch + "' in escape (%) pattern at index " + index);
    }

    /** Decodes {@code bytes[0..count)} as UTF-8 and appends the resulting characters to {@code sb}. */
    private static void appendUtf8(StringBuilder sb, int[] bytes, int count) {
        int pos = 0;
        while (pos < count) {
            int lead = bytes[pos++];
            if (lead < 0x80) { // 0xxxxxxx
                sb.append((char) lead);
                continue;
            }

            int extra;    // number of continuation bytes expected after the lead byte
            int cp;       // code point being assembled
            int minValue; // smallest code point this length may encode (rejects overlong forms)
            if ((lead & 0xE0) == 0xC0) {        // 110xxxxx 10xxxxxx
                extra = 1;
                cp = lead & 0x1F;
                minValue = 0x80;
            } else if ((lead & 0xF0) == 0xE0) { // 1110xxxx 10xxxxxx 10xxxxxx
                extra = 2;
                cp = lead & 0x0F;
                minValue = 0x800;
            } else if ((lead & 0xF8) == 0xF0) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                extra = 3;
                cp = lead & 0x07;
                minValue = 0x10000;
            } else {
                // Stray continuation byte (10xxxxxx) or a lead byte UTF-8 never uses (11111xxx).
                sb.append(REPLACEMENT_CHAR);
                continue;
            }

            int read = 0;
            while (read < extra && pos < count && (bytes[pos] & 0xC0) == 0x80) {
                cp = (cp << 6) | (bytes[pos] & 0x3F);
                pos++;
                read++;
            }

            boolean wellFormed = read == extra
                    && cp >= minValue
                    && cp <= 0x10FFFF
                    && (cp < 0xD800 || cp > 0xDFFF);
            if (wellFormed) {
                // Code points above U+FFFF are written as a UTF-16 surrogate pair.
                sb.appendCodePoint(cp);
            } else {
                sb.append(REPLACEMENT_CHAR);
            }
        }
    }
}
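Elasticsearch 查询(terms 聚合 + Painless 脚本)。注意:脚本中的解码逻辑是上面 Java 实现的简化版,每 3 个转义字节合成一个字符,因此只能正确解码 3 字节的 UTF-8 序列(如常见汉字)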
{
"size": 0,
"query": {
"bool": {
// 筛选条件
}
},
"aggs": {
"results": {
"terms": {
"size": 2,
"script": {
"lang": "painless",
"source": "def m = /^\\/controller\\/action.+?s=(.+?)&.*?$/.matcher(doc['nginx.access.url'].value);\nif (m.matches()) {\n String s = m.group(1);\n boolean needToChange = false;\n int numChars = s.length();\n StringBuilder sb = new StringBuilder();\n int i = 0;\n\n char c;\n byte[] bytes = null;\n\n String vv = \"+%\";\n byte vNum1 = (byte)vv.charAt(0);\n byte vNum2 = (byte)vv.charAt(1);\n while (i < numChars) {\n c = s.charAt(i);\n byte cNum = (byte)c;\n if (cNum == vNum1) {\n sb.append(' ');\n i++;\n needToChange = true;\n } else if (cNum == vNum2) {\n if (bytes == null) {\n bytes = new byte[(numChars - i) / 3];\n }\n int pos = 0;\n\n String hexString = \"\";\n int countHex = 0;\n while (((i + 2) < numChars) && ((byte)c == vNum2)) {\n int v = Integer.parseInt(s.substring(i + 1, i + 3), 16);\n hexString += s.substring(i + 1, i + 3);\n countHex += 1;\n if (3 == countHex) {\n int num = Integer.parseInt(hexString, 16);\n String bitString = Integer.toString(num, 2);\n String unicodeString = \"\";\n if ((num & 0xf0000000L) > 0) {\n unicodeString = bitString.substring(5, 8) + bitString.substring(10, 16) + bitString.substring(18, 24) + bitString.substring(26, bitString.length());\n } else if ((num & 0xe00000) > 0) {\n unicodeString = bitString.substring(4, 8) + bitString.substring(10, 16) + bitString.substring(18, bitString.length());\n } else if ((num & 0xc000) > 0) {\n unicodeString = bitString.substring(3, 8) + bitString.substring(10, bitString.length());\n } else {\n unicodeString = bitString.substring(1, bitString.length());\n }\n char result = (char)Integer.parseInt(unicodeString, 2);\n sb.append(result);\n\n hexString = \"\";\n countHex = 0;\n }\n bytes[pos++] = (byte)v;\n i += 3;\n if (i < numChars) {\n c = s.charAt(i);\n }\n }\n\n needToChange = true;\n } else {\n sb.append(c);\n i++;\n }\n }\n\n String ret = needToChange ? sb.toString() : s;\n return ret.toUpperCase();\n} else {\n return \"N/A\";\n}"
}
}
}
}
}
输出
{
"took": 2600,
"timed_out": false,
"_shards": {
"total": 278,
"successful": 278,
"skipped": 276,
"failed": 0
},
"hits": {
"total": 476944,
"max_score": 0,
"hits": []
},
"aggregations": {
"results": {
"doc_count_error_upper_bound": 1419,
"sum_other_doc_count": 359784,
"buckets": [
{
"key": "秋以为期",
"doc_count": 6514
},
{
"key": "原野苍茫",
"doc_count": 4704
}
]
}
}
}
参考
- java.net.URLDecoder.decode() 方法(原方法中用到了带参数的 new String(...) 构造函数,而 Elasticsearch 的 Painless 只支持无参形式的 new String(),因此需要手工把字节数组转换为 Unicode 字符)
- https://blog.csdn.net/hezh1994/article/details/78899683
- https://www.elastic.co/guide/en/elasticsearch/painless/6.7/painless-api-reference.html