1. Running the code locally
Note: as of the Spark 2.x API (Spark 2.2.0 is used here), the call method of FlatMapFunction returns an Iterator instead of the Iterable it returned in 1.x.
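Concretely, code written against the old API needs roughly this change in the call method (a sketch of just that method, mirroring the flatMap used below):

// Spark 1.x style: call returned an Iterable
// public Iterable<String> call(String line) throws Exception {
//     return Arrays.asList(line.split(" "));
// }

// Spark 2.x style (as in the code below): call returns an Iterator
// public Iterator<String> call(String line) throws Exception {
//     return Arrays.asList(line.split(" ")).iterator();
// }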
package cn.spark.study.core;

import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class WordCountLocal {
    public static void main(String[] args) {
        // local mode: the master is set to "local" directly in code
        SparkConf conf = new SparkConf().setAppName("WordCountLocal").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // read the local text file
        JavaRDD<String> lines = sc.textFile("spark.txt");

        // split each line into words (Spark 2.x: call returns an Iterator)
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" ")).iterator();
            }
        });

        // map each word to a (word, 1) pair
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });

        // sum the counts for each word
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        // print each (word, count) pair
        wordCounts.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            private static final long serialVersionUID = 1L;
            @Override
            public void call(Tuple2<String, Integer> wordcount) throws Exception {
                System.out.println(wordcount._1 + " appeared " + wordcount._2 + " times");
            }
        });

        sc.close();
    }
}
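As a side note, on Java 8+ the same job can be written with lambdas instead of anonymous inner classes, since Spark's Java function interfaces are functional interfaces. A minimal sketch (the class name WordCountLocalLambda is only an illustration):

package cn.spark.study.core;

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class WordCountLocalLambda {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("WordCountLocalLambda").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("spark.txt");
        // split, pair and reduce in one chained pipeline
        JavaPairRDD<String, Integer> wordCounts = lines
                .flatMap(line -> Arrays.asList(line.split(" ")).iterator())
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((v1, v2) -> v1 + v2);

        wordCounts.foreach(wc -> System.out.println(wc._1 + " appeared " + wc._2 + " times"));
        sc.close();
    }
}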
2. Running on the cluster via spark-submit
(1) Upload the input file to the Hadoop cluster: hadoop fs -put spark.txt
(2) Adapt the code for the cluster: drop setMaster (the master is supplied by spark-submit) and read the input from HDFS (the example below reads README.md from HDFS):
package cn.spark.study.core;

import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class WordCountCluster {
    public static void main(String[] args) {
        // cluster mode: no setMaster here, the master is passed by spark-submit
        SparkConf conf = new SparkConf().setAppName("WordCountCluster");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // read the input file from HDFS
        JavaRDD<String> lines = sc.textFile("hdfs://172.16.2.235:9000/user/root/README.md");

        // split each line into words
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" ")).iterator();
            }
        });

        // map each word to a (word, 1) pair
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });

        // sum the counts for each word
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        // print each (word, count) pair (runs on the executors)
        wordCounts.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            private static final long serialVersionUID = 1L;
            @Override
            public void call(Tuple2<String, Integer> wordcount) throws Exception {
                System.out.println(wordcount._1 + " appeared " + wordcount._2 + " times");
            }
        });

        sc.close();
    }
}
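One caveat for cluster runs: the System.out.println inside foreach executes on the executors, so the counts show up in the executors' stdout logs rather than in the spark-submit console. If you want the result on HDFS or on the driver instead, a minimal sketch to place at the end of main before sc.close() (the output path is only an example and must not already exist):

// write the result back to HDFS instead of printing on the executors
wordCounts.saveAsTextFile("hdfs://172.16.2.235:9000/user/root/wordcount_output");

// or, for small results, bring them back to the driver and print there:
// for (Tuple2<String, Integer> wc : wordCounts.collect()) {
//     System.out.println(wc._1 + " appeared " + wc._2 + " times");
// }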
(3) Package the project with the Maven plugin
Right-click the project -> Run As -> Run Configurations -> Maven Build -> right-click -> New
Name: spark-study-java
Base directory: the project root directory
Goals: clean package
Apply -> Run
(4) Copy spark-study-java-0.0.1-SNAPSHOT-jar-with-dependencies.jar from the target directory to the cluster
(5) Write a submit script, word.sh:
/opt/spark/bin/spark-submit \
--class cn.spark.study.core.WordCountCluster \
--num-executors 3 \
--driver-memory 1G \
--executor-memory 1G \
--executor-cores 1 \
/root/SparkJava/spark-study-java-0.0.1-SNAPSHOT-jar-with-dependencies.jar
Line 1: path to the spark-submit command
Line 2: fully qualified class name of WordCountCluster
Line 3: --num-executors, the number of executors to request
Line 4: driver memory
Line 5: memory per executor
Line 6: cores per executor
Line 7: path to the packaged jar
Run it with ./word.sh; if the script is not executable, grant permission first with chmod 777 word.sh.