下面介绍 Spark ML 中常用算法的使用方式与 demo,示例均以 Java API 编写。
pom依赖
这里使用的 Spark 版本为 2.0.0,Scala 版本为 2.11,可以根据自己的需求调整。
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.12</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.0.0</version>
</dependency>
支持向量机算法
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.mllib.linalg.DenseVector;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.classification.SVMModel;
import org.apache.spark.mllib.classification.SVMWithSGD;
import org.apache.spark.mllib.regression.LabeledPoint;
import java.util.ArrayList;
import java.util.List;
/**
 * Spark MLlib support vector machine (SVM) demo.
 *
 * <p>SVM here is a binary classifier: training labels must be 0 or 1.
 * After {@code clearThreshold()} the model outputs a raw continuous score
 * (closer to 0 or 1 indicates the predicted class); without clearing the
 * threshold, {@code predict()} returns a hard 0/1 label instead.
 */
public class SVM {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("test").master("local[2]").getOrCreate();
        // Build synthetic training data: two well-separated clusters near (0,0) and (1,1).
        List<LabeledPoint> labeledPoints = new ArrayList<>();
        for (int i = 0; i < 100000; i++) { // SVM is binary: labels are 0 or 1
            Vector vector1 = new DenseVector(new double[]{getRandom(0D, 0.1), getRandom(0D, 0.1)});
            LabeledPoint labeledPoint1 = new LabeledPoint(0, vector1); // label 0
            Vector vector2 = new DenseVector(new double[]{getRandom(0.9D, 1.0), getRandom(0.9D, 1.0)});
            LabeledPoint labeledPoint2 = new LabeledPoint(1, vector2); // label 1
            labeledPoints.add(labeledPoint1);
            labeledPoints.add(labeledPoint2);
        }
        SparkContext sc = spark.sparkContext();
        sc.setLogLevel("ERROR"); // silence Spark's verbose logging
        JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
        JavaRDD<LabeledPoint> data = jsc.parallelize(labeledPoints);
        // Split the initial RDD into [60% training data, 40% test data].
        JavaRDD<LabeledPoint>[] javaRDDS = data.randomSplit(new double[]{0.6, 0.4}, 11L);
        JavaRDD<LabeledPoint> train = javaRDDS[0];
        // BUG FIX: the original used javaRDDS[0] here too, so the "test" evaluation
        // actually ran on the training split. Use the held-out 40% split instead.
        JavaRDD<LabeledPoint> test = javaRDDS[1];
        int numIterations = 10000; // number of SGD iterations
        // On a cluster the model should be broadcast to every executor; skipped for local debugging.
        SVMModel model = SVMWithSGD.train(train.rdd(), numIterations);
        // Clear the default threshold so predict() returns a raw score; keeping the
        // threshold would force hard 0/1 outputs and inflate the measured error.
        model.clearThreshold();
        // Compute raw scores on the test set, paired with the true labels.
        JavaRDD<Tuple2<Double, Double>> scoreAndLabels =
                test.map(p -> new Tuple2<>(model.predict(p.features()), p.label()));
        long count = scoreAndLabels.count();
        Double reduce = scoreAndLabels
                .map(f -> Math.abs(f._1 - f._2))
                .reduce((Function2<Double, Double, Double>) (v1, v2) -> v1 + v2);
        System.out.println("误差" + reduce / count);
        for (LabeledPoint labeledPoint : test.take(10)) {
            System.out.println(labeledPoint.features() + ":" + labeledPoint.label() + "=>" + model.predict(labeledPoint.features()));
        }
    }

    /**
     * Returns a uniformly distributed random number in [min, max).
     *
     * @param min lower bound (inclusive)
     * @param max upper bound (exclusive)
     * @return a random value in the given range
     */
    public static Double getRandom(Double min, Double max) {
        return Math.random() * (max - min) + min;
    }
}
线性回归
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.ml.linalg.SparseVector;
import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.regression.LinearRegression;
import org.apache.spark.ml.regression.LinearRegressionModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Spark ML linear regression demo.
 *
 * <p>Trains a model on synthetic single-feature data generated from
 * y = 5x - 4 (with small uniform noise) and prints the model's prediction
 * against the original label for every training row.
 */
public class Regression {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("test").config("spark.sql.warehouse.dir", "D:\\code\\eye\\subject\\src\\test\\cache").master("local[1]").getOrCreate();
        List<String> dataList = getDataList();
        JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
        jsc.setLogLevel("ERROR");
        // In production, read lines from HDFS with jsc instead of parallelizing a local list.
        JavaRDD<String> value = jsc.parallelize(dataList);
        JavaRDD<Row> rows = getRowJavaRDD(value); // parse each CSV line into Row(label, features)
        StructType schema = getStructType(); // schema used to turn the JavaRDD into a Dataset
        Dataset<Row> data = spark.createDataFrame(rows, schema);
        LinearRegression lr = new LinearRegression().setMaxIter(10000).setRegParam(0.1).setElasticNetParam(0.2);
        LinearRegressionModel lrModel = lr.fit(data);
        getProbability(data, lrModel); // evaluate the model on its own training data
    }

    /**
     * Builds synthetic training data.
     *
     * <p>Line format is "label,feature". If you change the format, update the
     * parsing in {@link #getRowJavaRDD} accordingly.
     *
     * @return lines sampled from y = 5x - 4
     */
    private static List<String> getDataList() {
        // y = 5*x - 4
        List<String> myList1 = Arrays.asList("-2,-14", "-1,-9", "0,-4");
        List<String> myList2 = Arrays.asList("1,1", "2,6", "3,11");
        List<String> myList = new ArrayList<>();
        myList.addAll(myList1);
        myList.addAll(myList2);
        return myList;
    }

    /**
     * Parses each "label,feature" line into a Row(label, featureVector).
     *
     * @param value RDD of CSV lines
     * @return RDD of rows ready for {@code createDataFrame}
     */
    private static JavaRDD<Row> getRowJavaRDD(JavaRDD<String> value) {
        return value.map((Function<String, Row>) line -> { // LinearRegression's fit() takes a Dataset<Row>
            String[] parts = line.split(",");
            Double label = Double.parseDouble(parts[0]); // the label
            // The single feature, with small uniform noise so the fit is not exact.
            Double one = Double.parseDouble(parts[1]) + (Math.random() - 0.5) * 0.2;
            // The ML API requires the features column to be a Vector.
            // BUG FIX: the original declared a size-2 SparseVector while supplying only one
            // feature, leaving a phantom always-zero second dimension in the model.
            // Constructor args: 1) vector size, 2) indices of non-zero entries, 3) their values.
            Vector sparseVector = new SparseVector(1, new int[]{0}, new double[]{one});
            return RowFactory.create(label, sparseVector); // Row(label, feature vector)
        });
    }

    /**
     * Defines the schema used to convert the JavaRDD of rows into a Dataset.
     *
     * @return schema with a double "label" column and a vector "features" column
     */
    private static StructType getStructType() {
        ArrayList<StructField> fields = new ArrayList<>(); // schema for JavaRDD -> Dataset conversion
        fields.add(DataTypes.createStructField("label", DataTypes.DoubleType, true)); // label column
        fields.add(DataTypes.createStructField("features", org.apache.spark.ml.linalg.SQLDataTypes.VectorType(), true)); // feature-vector column
        return DataTypes.createStructType(fields);
    }

    /**
     * Prints the model's prediction next to the true label for every row,
     * giving a rough view of the training error.
     *
     * @param data    labeled rows to evaluate on
     * @param lrModel the trained linear regression model
     */
    private static void getProbability(Dataset<Row> data, LinearRegressionModel lrModel) {
        data.foreach((ForeachFunction<Row>) row -> {
            Vector vector = (Vector) row.get(1);
            double predict = lrModel.predict(vector);
            System.out.println("原始结果:" + row.get(0) + ", 训练得到的结果:" + predict);
        });
    }
}