array_except 能保证结果元素的顺序(按 array1 中首次出现的顺序)。
使用案例: 该函数从 Spark 2.4.0 开始支持。
官方描述:
Returns an array of the elements in array1 but not in array2,
without duplicates.
> SELECT array_except(array(1, 2, 3), array(1, 3, 5));
[2]
该函数对应的源码
@transient lazy val evalExcept: (ArrayData, ArrayData) => ArrayData = {
if (TypeUtils.typeWithProperEquals(elementType)) {
(array1, array2) =>
val hs = new OpenHashSet[Any]
var notFoundNullElement = true
var i = 0
while (i < array2.numElements()) {
if (array2.isNullAt(i)) {
notFoundNullElement = false
} else {
val elem = array2.get(i, elementType)
hs.add(elem)
}
i += 1
}
val arrayBuffer = new scala.collection.mutable.ArrayBuffer[Any]
i = 0
while (i < array1.numElements()) {
if (array1.isNullAt(i)) {
if (notFoundNullElement) {
arrayBuffer += null
notFoundNullElement = false
}
} else {
val elem = array1.get(i, elementType)
if (!hs.contains(elem)) {
arrayBuffer += elem
hs.add(elem)
}
}
i += 1
}
new GenericArrayData(arrayBuffer)
} else {
(array1, array2) =>
val arrayBuffer = new scala.collection.mutable.ArrayBuffer[Any]
var scannedNullElements = false
var i = 0
while (i < array1.numElements()) {
var found = false
val elem1 = array1.get(i, elementType)
if (elem1 == null) {
if (!scannedNullElements) {
var j = 0
while (!found && j < array2.numElements()) {
found = array2.isNullAt(j)
j += 1
}
// array2 is scanned only once for null element
scannedNullElements = true
} else {
found = true
}
} else {
var j = 0
while (!found && j < array2.numElements()) {
val elem2 = array2.get(j, elementType)
if (elem2 != null) {
found = ordering.equiv(elem1, elem2)
}
j += 1
}
if (!found) {
// check whether elem1 is already stored in arrayBuffer
var k = 0
while (!found && k < arrayBuffer.size) {
val va = arrayBuffer(k)
found = (va != null) && ordering.equiv(va, elem1)
k += 1
}
}
}
if (!found) {
arrayBuffer += elem1
}
i += 1
}
new GenericArrayData(arrayBuffer)
}
}
从源码可见顺序是有保证的
注意事项
它会去重(结果数组中不会出现重复元素)
spark-sql> select array_except(`array`(11,22,33,11,22,44),array(22));
[11,33,44]
从源码中可见: 如果 arr1 的元素不在 arr2 中, 还会再判断它是否已经在结果集中; 如果已经在, 就不会再新增到结果数组中.
关于null
spark-sql> select array_except(`array`(11,22,33,"23",22,44,null),array("44"));
["11","22","33","23",null]
spark-sql> select array_except(`array`(11,22,33,"23",22,44),array("44",null));
["11","22","33","23"]
Time taken: 0.082 seconds, Fetched 1 row(s)
spark-sql> select array_except(`array`(11,22,33,"23",22,44,null),array("44",null));
["11","22","33","23"]
数组内元素类型不同时怎么处理?
spark-sql> select array_except(`array`(11,22,33,"23",22,44),array("44"));
["11","22","33","23"]
上例中经过隐式类型转换后, e1、e2 无疑就是 string、string, 所以推导出的公共类型肯定是 string.
那么如果左右两个数组各自推导出的通用类型不同怎么办? 直接报错.