Implementing this with traditional Spark SQL, writing SQL statements and looping over the columns with repeated per-column queries, is too inefficient. Instead, we borrow the basic WordCount idea: treat "columnName+value", i.e. "column name:value", as a unique word, with the words separated by a special delimiter, so that a single pass over the data computes all the required statistics. A small sketch of the per-row encoding is shown below, followed by the implementation code:
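A minimal sketch of the encoding idea, using hypothetical column names and one example row (the names, values, and the EncodingSketch wrapper are illustrative only, not part of the original code): each cell is flattened into a "columnName:value" word terminated by the separator, and a null cell keeps an empty value part.

// Illustrative only: hypothetical columns and row values, assuming the "_0_" separator used in the implementation below.
object EncodingSketch {
  def main(args: Array[String]): Unit = {
    val separator = "_0_"
    val columns = Array("name", "age", "city")
    val row: Array[Any] = Array("Tom", null, "Beijing")
    val encoded = columns.zip(row).map {
      case (col, null) => col + ":" + separator        // missing value -> empty value part
      case (col, v)    => col + ":" + v + separator
    }.mkString
    println(encoded)  // prints: name:Tom_0_age:_0_city:Beijing_0_
  }
}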
import org.apache.spark.sql.DataFrame

def getStatistics(data: DataFrame):
    (java.util.HashMap[String, Long], java.util.HashMap[String, Long], java.util.HashMap[String, Long]) = {
  val colUnique = new java.util.HashMap[String, Long]  // unique values
  val colMissing = new java.util.HashMap[String, Long] // missing values
  val colSingle = new java.util.HashMap[String, Long]  // single values
  val allColArr = data.columns
  val dtypes = data.dtypes
  val colSize = allColArr.length
  val separator = "_0_"  // separator between "columnName:value" words
  val len = separator.length
  val rddHandle = data.rdd.map(row => {
    val str: StringBuilder = new StringBuilder
    for (i <- 0 until colSize) {
      if (row.get(i) == null) {
        // null cell: emit "columnName:" with an empty value part
        str.append(dtypes(i)._1 + ":" + separator)