Java implementation
package com.jh.java;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;
public class My_version1
{
public static void main(String[] args)
{
//Tell the JVM where Hadoop is installed; optional if the HADOOP_HOME environment variable is already set
System.setProperty("hadoop.home.dir", "G:\\untar\\hadoop-3.2.1\\hadoop-3.2.1");
//Spark configuration
SparkConf sparkConf = new SparkConf();
//Name the application
sparkConf.setAppName("My_java_spark");
//Run in local mode
sparkConf.setMaster("local[*]");
//Initialize Spark
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
//test01(javaSparkContext);
test02(javaSparkContext);
//Release resources
javaSparkContext.stop();
}
public static void test01(JavaSparkContext sc )
{
System.out.println("java-test01");
// //Prepare a collection
// List<Integer> asList = Arrays.asList(14,56,89,42,56,33);
// //Convert the collection into an RDD
// JavaRDD<Integer> javaRDD = sc.parallelize(asList);
//
// System.out.println(javaRDD);
// //Iterate over the elements
// javaRDD.foreach(t -> System.out.println(t));
//Prepare a collection
List<Integer> asList = Arrays.asList(14,56,89,42,56,33);
//The second argument is the number of partitions
JavaRDD<Integer> javaRDD = sc.parallelize(asList,2);
System.out.println(javaRDD);
javaRDD.foreachPartition(t->{
//Check the type: each partition is handed to us as an iterator
System.out.println("partition iterator type: " + t.getClass().getSimpleName());
//Iterate over the elements of this partition
t.forEachRemaining(f->System.out.print(f + ","));
System.out.println("end");
});
//Number of partitions
System.out.println("num:" + javaRDD.getNumPartitions());
}
public static void test02(JavaSparkContext sc)
{
System.out.println("java-test02");
String path = "E:\\eclipse201912_daima\\dev31th\\test01";
//Read the file into an RDD with two partitions
JavaRDD<String> textFile = sc.textFile(path,2);
//Collect the RDD into a list on the driver
List<String> collect = textFile.collect();
//Print the elements
System.out.println(collect);
//Print the number of partitions
System.out.println("num: " + textFile.getNumPartitions());
String path1 = "C:\\Users\\admin\\Desktop\\2";
//Save the RDD to disk as text files
textFile.saveAsTextFile(path1);
System.out.println("done");
}
}
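
The code above keeps printing getNumPartitions, but it never shows which element lands in which partition. One way to see that is mapPartitionsWithIndex, which hands each partition's iterator to a function together with the partition index. A minimal sketch under the same local-mode setup as above; the class name PartitionInspect is made up for illustration:

package com.jh.java;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
public class PartitionInspect
{
public static void main(String[] args)
{
SparkConf conf = new SparkConf().setAppName("PartitionInspect").setMaster("local[*]");
JavaSparkContext sc = new JavaSparkContext(conf);
//Same sample data as test01, split into two partitions
JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(14,56,89,42,56,33), 2);
//Tag each element with the index of the partition it lives in
JavaRDD<String> tagged = rdd.mapPartitionsWithIndex((idx, it) -> {
List<String> out = new ArrayList<>();
it.forEachRemaining(v -> out.add("partition " + idx + " -> " + v));
return out.iterator();
}, true);
//collect() pulls the tagged values back to the driver for printing
tagged.collect().forEach(System.out::println);
sc.stop();
}
}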
Scala implementation
package com.jh.scala
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import java.util.Arrays
object Teacher_version1 {
def main(args: Array[String]): Unit = {
//Tell the JVM where Hadoop is installed; optional if the HADOOP_HOME environment variable is already set
System.setProperty("hadoop.home.dir", "G:\\untar\\hadoop-3.2.1\\hadoop-3.2.1")
val conf = new SparkConf()
//Set the master: run in local mode
conf.setMaster("local[*]")
//Name the application
conf.setAppName("Teacher-version")
val sc = new SparkContext(conf)
test01(sc)
//test02(sc)
sc.stop()
}
def test01(c: SparkContext): Unit = {
println("test01")
val arr = Array(10,20,30,40)
//Create an RDD
val arrRdd = c.parallelize(arr)
//Collect the elements
var collectArr = arrRdd.collect()
// println("collectArr:" + Arrays.toString(collectArr))
//Any of the following ways of iterating over the elements works
// arrRdd.foreach(println)
// arrRdd.foreach(println(_))
// arrRdd.foreach(t=>println(t))
println("partitions")
//The second argument is the number of partitions
val arrParaRdd = c.parallelize(arr, 2)
//Collect the elements
collectArr = arrParaRdd.collect()
println("collectArr:" + Arrays.toString(collectArr))
//Inspect each partition of the partitioned RDD
arrParaRdd.foreachPartition(f=>
{//Check the type: each partition is handed to us as an iterator
println("partition iterator type: " + f.getClass.getSimpleName)
f.foreach(t=>{
print("element " + t + "\t")
})
println("end")
}
)
//Print the number of partitions
println("num:" + arrParaRdd.getNumPartitions)
}
def test02(sc: SparkContext): Unit = {
//File path
val path = "E:\\eclipse201912_daima\\dev31th\\test01"
//Arguments: file path, number of partitions
val textFileRdd = sc.textFile(path,2)
println("textFileRdd:" + textFileRdd)
//Print the number of partitions
println("getNumPartitions: " + textFileRdd.getNumPartitions)
//Collect the elements into an array
val textArr = textFileRdd.collect()
println("textArr: " + textArr.mkString(", "))
textFileRdd.foreach(println)
val tarPath = "C:\\Users\\admin\\Desktop\\1"
//Save the RDD to disk as text files
textFileRdd.saveAsTextFile(tarPath)
}
}
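
saveAsTextFile writes one part-xxxxx file per partition into the target directory, and it fails if that directory already exists. A quick way to verify the output is to read the directory back with textFile, which accepts a directory path and reads every file inside it. A minimal sketch, assuming the directory written by test02 is still on disk; the object name ReadBack and the path are illustrative:

package com.jh.scala
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
object ReadBack {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("ReadBack").setMaster("local[*]")
val sc = new SparkContext(conf)
//Directory that test02 wrote with saveAsTextFile (illustrative path)
val savedDir = "C:\\Users\\admin\\Desktop\\1"
//textFile reads every part-* file inside the directory
val readBack = sc.textFile(savedDir)
println("lines read back: " + readBack.count())
readBack.collect().foreach(println)
sc.stop()
}
}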