上代码
package com.fandatsys.dams.core.profiling.types;
import com.fandatsys.dams.core.profiling.DatasetColumnChartingProfileType;
import com.fandatsys.dams.core.profiling.ProfilingType;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.DecimalType;
import org.rosuda.JRI.Rengine;
import java.util.List;
/**
 * Histogram profile type: draws a frequency histogram of a single dataset column.
 *
 * <p>The column values are collected to the driver, handed to R through JRI and
 * rendered with R's {@code hist()} into a JPEG file.
 */
@ProfilingType(name = "Hist")
public class ProfileTypeHist implements DatasetColumnChartingProfileType {

    /** Fixed R symbol that receives the numeric column values. */
    private static final String R_VALUES = "profile_hist_values";

    /** Fixed R symbol that receives the column name; used only for chart labels. */
    private static final String R_LABEL = "profile_hist_label";

    /**
     * Collects {@code columnName} from {@code dataset} and writes its histogram to
     * {@code /Charting/Hist.jpg}.
     *
     * @param dataset    dataset that contains the column to chart
     * @param columnName name of the column whose values are plotted
     * @throws IllegalStateException if the R engine does not start
     * @throws NumberFormatException if a non-null cell cannot be read as a number
     */
    @Override
    public void readMysql(Dataset<Row> dataset, String columnName) {
        // Query the dataset through its own session instead of creating a second
        // SparkContext and round-tripping through a temp table and string-built SQL.
        List<Row> rows = dataset.select(dataset.col(columnName)).collectAsList();
        double[] values = toDoubles(rows);

        // NOTE(review): JRI documents that only one Rengine can exist per JVM;
        // confirm callers never run this concurrently or alongside another engine.
        Rengine engine = new Rengine(null, false, null);
        try {
            if (!engine.waitForR()) {
                throw new IllegalStateException("Unable to start the R engine");
            }
            // Data goes into fixed R symbols; the column name is passed as a value,
            // never spliced into R source, so any column name is safe to chart.
            engine.assign(R_VALUES, values);
            engine.assign(R_LABEL, columnName);
            // Chart path
            engine.eval("jpeg('/Charting/Hist.jpg')");
            try {
                engine.eval("hist(" + R_VALUES
                        + ", main = paste('Histogram of', " + R_LABEL + ")"
                        + ", xlab = " + R_LABEL + ")");
            } finally {
                // Always close the graphics device so the file is flushed.
                engine.eval("dev.off()");
            }
        } finally {
            engine.end();
        }
    }

    /**
     * Converts the first cell of every row to a {@code double}, skipping null cells.
     *
     * <p>The cell is read with {@code Row.get(0)} because {@code Row.toString()}
     * wraps the values in brackets and therefore cannot be parsed as a number.
     */
    private static double[] toDoubles(List<Row> rows) {
        double[] buffer = new double[rows.size()];
        int count = 0;
        for (Row row : rows) {
            if (row.isNullAt(0)) {
                continue;
            }
            Object cell = row.get(0);
            if (cell instanceof Number) {
                buffer[count++] = ((Number) cell).doubleValue();
            } else {
                buffer[count++] = Double.parseDouble(cell.toString().trim());
            }
        }
        if (count == buffer.length) {
            return buffer;
        }
        double[] trimmed = new double[count];
        System.arraycopy(buffer, 0, trimmed, 0, count);
        return trimmed;
    }
}
该方法接收两个参数:dataset 和 columnName。dataset 是 Spark SQL 中 Dataset 的实例,columnName 是要操作的数据库表中某一列的列名。首先需要导入 JRI.jar 包,该包提供了从 Java 调用 R 的方法;我们只需要把结果以字符串或数组的形式作为参数传给 R,作为绘图的源数据。
engine.assign(columnName, arrayCount2);
这一步是将数组通过传参的方式赋值给 R 中的一个变量,变量名可以随意起;我把它起成了 columnName,也就是字段名,方便传参。
engine.eval("jpeg('/Charting/Hist.jpg')");
这一步是为了将图片以jpg的形式保存到一个目录下
engine.eval("hist(" + columnName + ")");
这一步是进行画图,我画的是一个频数直方图
engine.eval("dev.off()");
engine.end();
最后关闭,退出啦