impala C++ urldecode udf 函数
由于涉及到自己的知识盲区,我犯了一个挺严重的错误:创建了一个 Impala C++ UDF,函数逻辑里使用了 assert,查询触发该断言后,导致好几台机器上的 impalad 进程直接退出。
原因:Impala 底层用 C++ 实现,原生 UDF 直接运行在 impalad 进程内;assert(0) 断言失败会调用 abort(),相当于直接结束整个 impalad 进程,大家慎用!!!
日志错误如下,然后impalad就挂了
UrlDecode.h
#ifndef UDF_DECODE_H
#define UDF_DECODE_H

// std::string is used in the declarations below, so include it here instead of
// relying on whichever file includes this header to have done so first.
#include <string>

#include "udf/udf.h"

// Kept so that the .cc file can keep using StringVal / FunctionContext unqualified.
using namespace impala_udf;

// Decodes a URL-encoded string: '+' becomes a space and "%XX" becomes the byte
// whose hexadecimal value is XX. Handling of malformed input is defined by the
// implementation in UrlDecode.cc.
std::string UrlDecode(const std::string& str);

// UDF entry point: returns NULL for NULL input, otherwise the URL-decoded
// form of `input`.
StringVal UrlDecodeFunc(FunctionContext* context, const StringVal& input);

#endif
UrlDecode.cc
#include "string"
#include "UrlDecode.h"
#include <iostream>
#include <sstream>
#include <cassert>
#include <iomanip>
using namespace std;
// Converts a nibble value to its uppercase hexadecimal ASCII character:
// 0..9 map to '0'..'9' and 10..15 map to 'A'..'F'.
// Values above 15 are not validated; the result simply wraps to unsigned char.
unsigned char ToHex(unsigned char x)
{
    if (x <= 9)
    {
        return static_cast<unsigned char>('0' + x);
    }
    return static_cast<unsigned char>('A' - 10 + x);
}
// Converts one ASCII character to its numeric value for percent-decoding.
// '0'..'9' yield 0..9; 'A'..'Z' and 'a'..'z' yield 10..35 (letters past F/f are
// accepted by the range checks and are not rejected).
// Any other character trips assert(0). When assertions are enabled this aborts
// the whole host process -- this is what brought the impalad process down.
unsigned char FromHex(unsigned char x)
{
    if ('0' <= x && x <= '9')
    {
        return static_cast<unsigned char>(x - '0');
    }
    if ('A' <= x && x <= 'Z')
    {
        return static_cast<unsigned char>(x - 'A' + 10);
    }
    if ('a' <= x && x <= 'z')
    {
        return static_cast<unsigned char>(x - 'a' + 10);
    }
    assert(0); // Kills the impalad process on unexpected input.
    return 0;  // Only reachable when assert is compiled out (NDEBUG).
}
// Decodes a URL-encoded string: '+' becomes a space, "%XY" becomes the single
// byte FromHex(X) * 16 + FromHex(Y), and every other character is copied as-is.
// A '%' that is not followed by two more characters trips the assert below,
// which aborts the process when assertions are enabled.
std::string UrlDecode(const std::string& str)
{
    std::string decoded;
    const size_t total = str.length();
    size_t pos = 0;
    while (pos < total)
    {
        const char current = str[pos];
        switch (current)
        {
        case '+':
            decoded.push_back(' ');
            break;
        case '%':
        {
            // Both hex digits must lie inside the string.
            assert(pos + 2 < total);
            const unsigned char high = FromHex(static_cast<unsigned char>(str[pos + 1]));
            const unsigned char low = FromHex(static_cast<unsigned char>(str[pos + 2]));
            decoded.push_back(static_cast<char>(high * 16 + low));
            pos += 2; // Skip the two digits that were just consumed.
            break;
        }
        default:
            decoded.push_back(current);
            break;
        }
        ++pos;
    }
    return decoded;
}
// UDF entry point. Returns NULL for NULL input; otherwise URL-decodes the
// input bytes and copies the decoded bytes into a StringVal constructed from
// `context` and the decoded length.
// NOTE(review): that constructor presumably allocates the result buffer through
// the function context -- confirm against udf/udf.h.
StringVal UrlDecodeFunc(FunctionContext* context, const StringVal& input) {
    if (input.is_null) {
        return StringVal::null();
    }
    const char* raw = reinterpret_cast<const char*>(input.ptr);
    const std::string encoded(raw, input.len);
    const std::string decoded = UrlDecode(encoded);
    StringVal result(context, decoded.size());
    memcpy(result.ptr, decoded.data(), decoded.size());
    return result;
}
不使用断言,优化后:
#include "string"
#include "UrlDecode.h"
#include <iostream>
#include <sstream>
#include <cassert>
#include <iomanip>
using namespace std;
// Returns the uppercase hexadecimal ASCII character for a nibble value
// (0..9 -> '0'..'9', 10..15 -> 'A'..'F'). Inputs above 15 are not checked.
unsigned char ToHex(unsigned char x)
{
    // 'A' - 10 == 55 and '0' == 48 in ASCII.
    const int offset = (x > 9) ? ('A' - 10) : '0';
    return static_cast<unsigned char>(x + offset);
}
// Converts one hexadecimal ASCII digit to its numeric value.
//
// Returns 0..15 for '0'..'9', 'A'..'F' and 'a'..'f', and -1 for any other
// character. It never asserts, so malformed input cannot abort the host process.
//
// Every path now returns a value: previously the valid-digit branches stored
// the result in a local and then fell off the end of the function, which is
// undefined behaviour. The letter ranges are also limited to real hex digits
// (F/f) instead of Z/z.
int FromHex(unsigned char x)
{
    if (x >= '0' && x <= '9') return x - '0';
    if (x >= 'A' && x <= 'F') return x - 'A' + 10;
    if (x >= 'a' && x <= 'f') return x - 'a' + 10;
    return -1;
}
// Decodes a URL-encoded string without ever aborting on malformed input.
//
//   '+'    -> ' '
//   "%XY"  -> the byte with hexadecimal value XY, when FromHex accepts both digits
//   other  -> copied unchanged
//
// A '%' that is not followed by two valid hex digits (including a '%' too close
// to the end of the string) is copied literally and nothing after it is
// consumed. Previously such an escape was dropped together with the next two
// characters, silently losing data (e.g. "100%sure" became "100re").
std::string UrlDecode(const std::string& str)
{
    std::string decoded;
    const size_t length = str.length();
    decoded.reserve(length); // The decoded text is never longer than the input.
    for (size_t i = 0; i < length; ++i)
    {
        const char c = str[i];
        if (c == '+')
        {
            decoded += ' ';
        }
        else if (c == '%' && i + 2 < length)
        {
            const int high = FromHex(static_cast<unsigned char>(str[i + 1]));
            const int low = FromHex(static_cast<unsigned char>(str[i + 2]));
            if (high >= 0 && low >= 0)
            {
                decoded += static_cast<char>(high * 16 + low);
                i += 2; // Skip the two hex digits that were just decoded.
            }
            else
            {
                // Not a valid escape: keep the '%' and let the following
                // characters be processed normally.
                decoded += c;
            }
        }
        else
        {
            decoded += c;
        }
    }
    return decoded;
}
// UDF entry point: URL-decodes `input`.
//
// Returns NULL when `input` is NULL. Otherwise the input bytes are decoded with
// UrlDecode and copied into a StringVal constructed from `context` and the
// decoded length.
StringVal UrlDecodeFunc(FunctionContext* context, const StringVal& input) {
    if (input.is_null)
        return StringVal::null();
    const std::string output = UrlDecode(std::string(reinterpret_cast<const char*>(input.ptr), input.len));
    StringVal result(context, output.size());
    // NOTE(review): per the Impala UDF header, this constructor yields a NULL
    // StringVal when the buffer cannot be allocated -- confirm for the deployed
    // version. Never copy into a result that has no buffer.
    if (result.is_null)
        return result;
    // Skip the copy for an empty result so memcpy is never handed a pointer
    // that may be null.
    if (!output.empty())
        memcpy(result.ptr, output.data(), output.size());
    return result;
}
impala创建udf
# 环境
1.gcc、g++ 4.8.5 20150623,使用此版本,高版本会报错
2.cmake version 3.22.2
3.boost、boost-devel
# impala-udf
git clone https://github.com/laserson/impala-udf-devel
cd impala-udf-devel   # 若是下载 zip 包后解压,目录名为 impala-udf-devel-master(下文日志中即为该目录)
修改文件:CMakeLists.txt,改成自己的 .cc 文件
# 执行 cmake . && make,函数就编译完了
[root@hdfs1 impala-udf-devel-master]# cmake .
CMake Warning (dev) in CMakeLists.txt:
No project() command is present. The top-level CMakeLists.txt file must
contain a literal, direct call to the project() command. Add a line of
code such as
project(ProjectName)
near the top of the file, but after cmake_minimum_required().
CMake is pretending there is a "project(Project)" command on the first
line.
This warning is for project developers. Use -Wno-dev to suppress it.
CMake Deprecation Warning at CMakeLists.txt:15 (cmake_minimum_required):
Compatibility with CMake < 2.8.12 will be removed from a future version of
CMake.
Update the VERSION argument <min> value or use a ...<max> suffix to tell
CMake that the project does not need compatibility with older versions.
-- Found clang executable: CLANG_EXECUTABLE-NOTFOUND
-- Configuring done
-- Generating done
-- Build files have been written to: /opt/module/impala_udf/impala-udf-devel-master
[root@hdfs1 impala-udf-devel-master]# make
Consolidate compiler generated dependencies of target UrlDecode
[100%] Built target UrlDecode
[root@hdfs1 impala-udf-devel-master]#
# 将 build 下的 libUrlDecode.so 上传到hdfs
# 删除旧函数,重新创建并执行 UDF 函数
[hdfs1:21050] default> drop function urldecode(string);
Query: drop function urldecode(string)
+----------------------------+
| summary |
+----------------------------+
| Function has been dropped. |
+----------------------------+
Fetched 1 row(s) in 5.89s
[hdfs1:21050] default> create function UrlDecode(string) returns string location '/impala-udf/libUrlDecode.so' symbol='UrlDecodeFunc';
Query: create function UrlDecode(string) returns string location '/impala-udf/libUrlDecode.so' symbol='UrlDecodeFunc'
+----------------------------+
| summary |
+----------------------------+
| Function has been created. |
+----------------------------+
Fetched 1 row(s) in 5.27s
[hdfs1:21050] default> select urldecode("%E4%BD%A0%E5%A5%BD");
Query: select urldecode("%E4%BD%A0%E5%A5%BD")
Query submitted at: 2024-01-26 23:59:34 (Coordinator: http://hdfs1:25000)
Query progress can be monitored at: http://hdfs1:25000/query_plan?query_id=1d4fedadf9ee3a46:35e8bcf800000000
+----------------------------------------------------------+
| default.urldecode('%e4%bd%a0%e5%a5%bd') /* native udf */ |
+----------------------------------------------------------+
| 你好 |
+----------------------------------------------------------+
Fetched 1 row(s) in 0.23s
[hdfs1:21050] default>