参考致谢:
https://www.bilibili.com/video/BV1Xz4y1m7cv?from=search&seid=4155013264152072680&spm_id_from=333.337.0.0
一、使用Java语言开发Spark Streaming完成WordCount
package Test;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import java.util.Arrays;
public class Test31 {
    /**
     * Spark Streaming WordCount demo: reads comma-separated text from a
     * socket (node1:9999) in 5-second micro-batches and prints the per-batch
     * word counts to stdout.
     */
    public static void main(String[] args) throws InterruptedException {
        // Local-mode streaming context with a 5-second batch interval.
        SparkConf conf = new SparkConf().setAppName("JavaSparkDemo").setMaster("local[*]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        // Each line arriving on the socket becomes one stream element.
        JavaReceiverInputDStream<String> lines = jssc.socketTextStream("node1", 9999);

        // Split each line on commas (keeping empty tokens, per the -1 limit),
        // pair every word with 1, then sum the counts per word.
        JavaPairDStream<String, Integer> wordCounts = lines
                .flatMap(line -> Arrays.asList(line.split(",", -1)).iterator())
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey(Integer::sum);
        wordCounts.print();

        // Start the streaming job and block until it is terminated.
        jssc.start();
        jssc.awaitTermination();
    }
}
二、使用Java语言开发Spark SQL完成WordCount
package TTest;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.col;
import java.util.Arrays;
public class TTest32 {
    /**
     * Spark SQL WordCount demo: reads a text file, splits each line into
     * words, and counts occurrences twice — once via a SQL query over a temp
     * view, once via the DataFrame API.
     *
     * Fixes vs. the original:
     * - The SQL string was concatenated without separating spaces, yielding
     *   the invalid query "...countsfrom t_wordgroup by...".
     * - createOrReplaceGlobalTempView registers the view in the "global_temp"
     *   database, so "from t_word" could not resolve; a session-local temp
     *   view is used instead.
     * - The input path was hard-coded to ""; it now comes from args with a
     *   fallback default.
     */
    public static void main(String[] args) throws InterruptedException {
        SparkSession spark = SparkSession.builder().appName("JavaSparkDemo").master("local[*]").getOrCreate();
        spark.sparkContext().setLogLevel("WARN"); // level names conventionally upper-case

        // Input path: first CLI argument, or a default sample file.
        String inputPath = args.length > 0 ? args[0] : "data/words.txt";
        Dataset<String> ds = spark.read().textFile(inputPath);

        // One row per word, splitting lines on single spaces.
        Dataset<String> wordsDS = ds.flatMap(
                (String line) -> Arrays.asList(line.split(" ")).iterator(), Encoders.STRING());

        // Session-scoped temp view, queryable as plain "t_word".
        wordsDS.createOrReplaceTempView("t_word");
        String sql = "select value, count(*) as counts "
                + "from t_word "
                + "group by value "
                + "order by counts desc";
        spark.sql(sql).show();

        // Same aggregation expressed with the DataFrame API.
        wordsDS.groupBy("value")
                .count()
                .orderBy(col("count").desc())
                .show();

        spark.stop();
    }
}
三、使用Java语言开发Spark MLlib-线性回归算法-房价预测案例
package TTest;
import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.ml.regression.LinearRegression;
import org.apache.spark.ml.regression.LinearRegressionModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQueryException;
public class TTest34 {
public static void main(String[] args) throws InterruptedException, StreamingQueryException {
SparkSession spark = SparkSession.builder()
.appName("JavaSparkDemo")
.master("local[*]")
.getOrCreate();
spark.sparkContext().setLogLevel("WARN");
Dataset<Row> homedata = spark.read()
.format("csv")
.option("sep", "|")
.option("header", "true")
.option("inferSchema", "true")
.load("TTest/homeprice.data");
homedata.printSchema();
homedata.show();
Dataset<Row> featuredDF = homedata.select("sqFt", "age", "ares", "price");
new VectorAssembler
Dataset<Row>[] arr=vectorDF.randomSplit(new double[]{0.8,0.2},100);
Dataset<Row> trainSet=arr[0];
Dataset<Row> testSet=arr[1];
LinearRegressionModel model=new LinearRegression()
.setFeaturesCol("features")
.setLabelCol("price")
.setPredictionCol("predict_price")
.setMaxIter(10)
.fit(trainSet);
Dataset<Row> testResult=model.transform(testSet);
testResult.show();
double rmse=RegressionEvaluator evaluator=new RegressionEvaluator()
.setMetricName("rmse")
.setLabelCol("price")
.setPredictionCol("predict_price")
.evaluator.evaluate(testResult);
System.out.println("rmse为:"+rmse);
spark.stop();
}
}