Java
1.
public static void main(String[] args) {
    SparkConf conf = new SparkConf();
    conf.setMaster("local[*]");
    conf.setAppName(_01WordCountJavaApp.class.getSimpleName());
    JavaSparkContext jsc = new JavaSparkContext(conf);

    JavaRDD<String> lines = jsc.textFile("file:///D:/hello.txt");
    // split each line into words
    JavaRDD<String> wordsRDD = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String line) throws Exception {
            return Arrays.asList(line.split(",|\\s+")).iterator();
        }
    });
    // map each word to a (word, 1) pair
    JavaPairRDD<String, Integer> pairRDD = wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String word) throws Exception {
            return new Tuple2<String, Integer>(word, 1);
        }
    });
    // sum the counts for each word
    JavaPairRDD<String, Integer> retRDD = pairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });
    // print the results
    retRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        @Override
        public void call(Tuple2<String, Integer> t) throws Exception {
            System.out.println(t._1 + "---->" + t._2);
        }
    });
    jsc.stop();
}
2.
public static void main(String[] args) {
    // quiet the verbose Spark/Hadoop logging (log4j's level is WARN)
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN);
    Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN);
    Logger.getLogger("org.spark_project").setLevel(Level.WARN);

    SparkConf conf = new SparkConf();
    conf.setMaster("local[*]");
    conf.setAppName(_02WordCountJavaApp.class.getSimpleName());
    JavaSparkContext jsc = new JavaSparkContext(conf);

    JavaRDD<String> lines = jsc.textFile("file:///D:/hello.txt");
    // same pipeline as above, written with Java 8 lambdas
    JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split("\\s+")).iterator());
    JavaPairRDD<String, Integer> pairsRDD = words.mapToPair(word -> new Tuple2<String, Integer>(word, 1));
    JavaPairRDD<String, Integer> ret = pairsRDD.reduceByKey((v1, v2) -> v1 + v2);
    ret.foreach(t -> System.out.println(t._1 + "---" + t._2));
    jsc.stop();
}
Scala
val array = Array("a b c", "a b", "a")
array.map(x => x.split(" ")).flatten.map(x => (x, 1)).groupBy(x => x._1).map(x => (x._1, x._2.length))
array.flatMap(x => x.split(" ")).map(x => (x, 1)).groupBy(x => x._1).map(x => (x._1, x._2.length))
array.flatMap(x => x.split(" ")).groupBy(x => x).map(x => (x._1, x._2.length))
array.flatMap(x => x.split(" ")).groupBy(x => x).map(x => (x._1, x._2.length)).toList.sortBy(x => x._2).reverse
array.flatMap(x => x.split(" ")).groupBy(x => x).map(x => (x._1, x._2.length)).toList.sortWith((x, y) => x._2 > y._2)
array.flatMap(_.split(" ")).map((_, 1)).groupBy(_._1).mapValues(_.foldLeft(0)(_+_._2))
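For reference, a minimal runnable sketch wrapping the grouping approach above (the object name is illustrative); for the sample array it should print a -> 3, b -> 2, c -> 1 in some order:

object CollectionWordCount {
  def main(args: Array[String]): Unit = {
    val array = Array("a b c", "a b", "a")
    // split each line into words, group identical words, count each group
    val counts = array.flatMap(_.split(" ")).groupBy(identity).map { case (w, ws) => (w, ws.length) }
    counts.foreach { case (w, c) => println(s"$w -> $c") }
  }
}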
Spark
1.
object _01WordCountScalaApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName(s"${_01WordCountScalaApp.getClass.getSimpleName}")
      .setMaster("local")
    val sc = new SparkContext(conf)

    val list = List(
      "guo jing",
      "huang rong",
      "ou yang feng",
      "hong qi gong",
      "yang kang",
      "yang guo "
    )
    val listRDD: RDD[String] = sc.parallelize(list)
    val retRDD: RDD[(String, Int)] = listRDD.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_ + _)
    retRDD.foreach(println)
    sc.stop()
  }
}
2.
/**
 * Load a file from an external HDFS cluster and run the computation.
 * java.net.UnknownHostException: ns1
 * Adding core-site.xml and hdfs-site.xml to the classpath fixes this.
 * After that, reading a local file will fail with a path-not-found error,
 * because with those two files present the path passed in is automatically
 * resolved as an HDFS path; local files must then be written as:
 * file:///E:/data/hello.txt
 */
object _02WordCountApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName(s"${_02WordCountApp.getClass.getSimpleName}")
      .setMaster("local")
    val sc = new SparkContext(conf)

    val listRDD: RDD[String] = sc.textFile("hdfs://bd1901/data/hello")
    val retRDD: RDD[(String, Int)] = listRDD.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_ + _)
    // an action is needed to actually trigger the job
    retRDD.foreach(println)
    sc.stop()
  }
}
3.
object _03WordCountRemoteApp {
  def main(args: Array[String]): Unit = {
    if (args == null || args.length < 3) {
      println(
        """Parameter Errors! Usage: <inputpath> <sleep> <output>
          |inputpath: input path for the program
          |sleep:     sleep time in milliseconds
          |output:    output path for the program
        """.stripMargin)
      System.exit(-1)
    }
    val Array(inputpath, sleep, output) = args // pattern matching on the argument array
    val conf = new SparkConf()
      .setAppName(s"${_03WordCountRemoteApp.getClass.getSimpleName}")
//    .setMaster("local")
    val sc = new SparkContext(conf)

    val listRDD: RDD[String] = sc.textFile(inputpath)
    val retRDD: RDD[(String, Int)] = listRDD.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_ + _)
    retRDD.collect() // pulls the partition data from the executors back to the driver
    // keep the application alive for a while (e.g. to inspect the Web UI); output is parsed but not used here
    Thread.sleep(sleep.toLong)
    sc.stop()
  }
}
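Since setMaster is left commented out, the master is expected to come from the submit command. A possible submission might look like the following (the jar name, sleep value, and output path are assumptions for illustration, not taken from these notes):

spark-submit \
  --class _03WordCountRemoteApp \
  --master yarn \
  --deploy-mode client \
  wordcount.jar \
  hdfs://bd1901/data/hello 30000 hdfs://bd1901/out/wc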