I. RDD
1. An RDD can be created in two ways (see the sketch below):
- by loading an external dataset
- by parallelizing a collection of objects (e.g., a List or a Set) that lives in the driver program
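For example (a minimal sketch; `sc` is an already-created JavaSparkContext, as in the full listing below):

```java
// Way 1: load an external dataset
JavaRDD<String> lines = sc.textFile("./data/words.txt");
// Way 2: parallelize a collection that already exists in the driver
JavaRDD<String> words = sc.parallelize(Arrays.asList("pandas", "i like pandas"));
```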
2. RDDs support two types of operations: transformations and actions
- Transformations, such as filter(), produce a new RDD
- Actions compute a result from an RDD and return it to the driver program
- RDD.persist() asks Spark to keep the RDD around (in memory by default) so it can be queried repeatedly without being recomputed; see the sketch after this list
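A minimal persist() sketch (assumptions: `sc` is an existing JavaSparkContext, and StorageLevel comes from org.apache.spark.storage):

```java
JavaRDD<Integer> nums = sc.parallelize(Arrays.asList(1, 2, 3, 4));
JavaRDD<Integer> squares = nums.map(x -> x * x);  // transformation: nothing runs yet
squares.persist(StorageLevel.MEMORY_ONLY());      // mark the RDD for in-memory reuse
System.out.println(squares.count());              // first action computes and caches
System.out.println(squares.collect());            // second action reads the cached data
```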
3. How a Spark program works (sketched after this list)
- Create input RDDs from external data
- Transform them with operations such as filter() to define new RDDs
- Ask Spark to persist() any intermediate RDDs that will need to be reused
- Launch actions such as count() and first() to kick off a parallel computation, which Spark optimizes before executing
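Putting the four steps together, a minimal sketch (assumes the same imports as the full listing below, plus org.apache.spark.storage.StorageLevel; the file path is illustrative):

```java
SparkConf conf = new SparkConf().setAppName("workflow");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> lines = sc.textFile("./data/README.md");         // 1. create an input RDD
JavaRDD<String> errors = lines.filter(s -> s.contains("error")); // 2. transformation defines a new RDD
errors.persist(StorageLevel.MEMORY_ONLY());                      // 3. persist the RDD that will be reused
System.out.println(errors.count());                              // 4. action triggers the optimized computation
System.out.println(errors.take(5));                              // reuses the persisted data
sc.stop();
```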
4. The operations in code
- Creating an RDD (both ways appear in the full listing below):
  read an external dataset
  parallelize an existing collection through the SparkContext
package com.tripleone.spark;

import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
public class SimpleApp {

    /**
     * Count how many lines contain the letter "a" and how many contain "b".
     */
    public static void countLines(String filePath) {
        // Create a Spark context
        SparkConf conf = new SparkConf().setAppName("Simple Application");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Read the input data and cache it, since it is scanned twice
        JavaRDD<String> logData = sc.textFile(filePath).cache();
        long numAs = logData.filter(new Function<String, Boolean>() {
            public Boolean call(String s) {
                return s.contains("a");
            }
        }).count();
        long numBs = logData.filter(new Function<String, Boolean>() {
            public Boolean call(String s) {
                return s.contains("b");
            }
        }).count();
        System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
        sc.stop();
    }
    /**
     * Count how many times each word occurs.
     *
     * @param filePath   input text file
     * @param outputFile output directory for the counts
     */
    public static void countWords(String filePath, String outputFile) {
        // Create a Spark context
        SparkConf conf = new SparkConf().setAppName("wordCount");
        JavaSparkContext sparkContext = new JavaSparkContext(conf);
        // Read the input data
        JavaRDD<String> input = sparkContext.textFile(filePath);
        // Split each line into words
        JavaRDD<String> words = input.flatMap(
                new FlatMapFunction<String, String>() {
                    @Override
                    public Iterator<String> call(String s) throws Exception {
                        return Arrays.asList(s.split(" ")).iterator();
                    }
                }
        );
        // Turn each word into a (word, 1) pair, then sum the counts per key
        JavaPairRDD<String, Integer> counts = words.mapToPair(
                new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String s) throws Exception {
                        return new Tuple2<>(s, 1);
                    }
                }
        ).reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
            }
        });
        // Save the word counts to a text file; this action triggers evaluation
        counts.saveAsTextFile(outputFile);
    }
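    /**
     * A Java 8 lambda version of countWords, added here as an illustrative
     * sketch (this method is not part of the original notes). Spark's Java
     * function interfaces each have a single abstract method, so lambdas can
     * replace the anonymous classes above.
     */
    public static void countWordsLambda(String filePath, String outputFile) {
        SparkConf conf = new SparkConf().setAppName("wordCountLambda");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaPairRDD<String, Integer> counts = sc.textFile(filePath)
                .flatMap(s -> Arrays.asList(s.split(" ")).iterator()) // split lines into words
                .mapToPair(s -> new Tuple2<>(s, 1))                   // pair each word with a count of 1
                .reduceByKey((a, b) -> a + b);                        // sum the counts per word
        counts.saveAsTextFile(outputFile);                            // action: triggers evaluation
        sc.stop();
    }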
    /**
     * A walk-through of basic RDD operations.
     */
    public static void learnSpark(String filePath) {
        SparkConf conf = new SparkConf().setAppName("the first sparkApp");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Creating an RDD, way 1: load an external dataset
        final JavaRDD<String> input = sc.textFile(filePath);
        // Creating an RDD, way 2: parallelize an existing collection
        JavaRDD<String> lines = sc.parallelize(Arrays.asList("pandas", "i like pandas"));
        // Transformation: filter() implemented with an anonymous Function
        JavaRDD<String> errorRDD = input.filter(
                new Function<String, Boolean>() {
                    @Override
                    public Boolean call(String s) throws Exception {
                        return s.contains("error");
                    }
                }
        );
        // JavaRDD<String> errorRDD = input.filter(s -> s.contains("error")); // Java 8 lambda equivalent
        // Actions: count the error lines
        // take(n) fetches a few elements; collect() fetches the whole dataset
        System.out.println("Input had " + errorRDD.count() + " error lines");
        System.out.println("Here are 10 examples: ");
        for (String line : errorRDD.take(10)) { // take() fetches a small number of elements
            System.out.println(line);
        }
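        // collect() (illustrative, not in the original notes) would return every
        // element to the driver, so it is only safe for RDDs small enough to
        // fit in driver memory:
        // List<String> allErrors = errorRDD.collect();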
        // map(): compute the square of each value
        JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
        JavaRDD<Integer> result = rdd.map(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer integer) throws Exception {
                return integer * integer;
            }
        });
        System.out.println(StringUtils.join(result.collect(), ",")); // note: StringUtils comes from commons-lang3
        // flatMap(): split each line into words
        JavaRDD<String> lineData = sc.parallelize(Arrays.asList("hello word ", "hi"));
        JavaRDD<String> words = lineData.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String s) throws Exception {
                return Arrays.asList(s.split(" ")).iterator();
            }
        });
        System.out.println(words.first());
        // reduce(): compute the sum of all elements
        JavaRDD<Integer> numRDD = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));
        Integer sum = numRDD.reduce(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer x, Integer y) throws Exception {
                return x + y;
            }
        });
        System.out.println("----------- sum: " + sum);
        // aggregate(): compute the average of the elements
        JavaRDD<Integer> avgRDD = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));
        // Folds each element into the running accumulator
        Function2<AvgCount, Integer, AvgCount> addAndCount =
                new Function2<AvgCount, Integer, AvgCount>() {
                    @Override
                    public AvgCount call(AvgCount avgCount, Integer integer) throws Exception {
                        avgCount.total += integer;
                        avgCount.num += 1;
                        return avgCount;
                    }
                };
        // Merges two accumulators from different partitions
        Function2<AvgCount, AvgCount, AvgCount> combine =
                new Function2<AvgCount, AvgCount, AvgCount>() {
                    @Override
                    public AvgCount call(AvgCount avgCount, AvgCount avgCount2) throws Exception {
                        avgCount.total += avgCount2.total;
                        avgCount.num += avgCount2.num;
                        return avgCount;
                    }
                };
        AvgCount initial = new AvgCount(0, 0);
        AvgCount rt = avgRDD.aggregate(initial, addAndCount, combine);
        System.out.println(rt.avg());
        sc.stop();
    }
    public static void main(String[] args) {
        String path = System.getProperty("user.dir");
        System.out.println("path: " + path);
        String logFile = "./data/README.md"; // should be some file on your system
        // countLines(logFile);
        String wordPath = "./data/words.txt";
        String out = "./data/output";
        // countWords(wordPath, out);
        learnSpark(wordPath);
    }
}
// Serializable so Spark can ship instances of this accumulator to executors
class AvgCount implements Serializable {
    public int total;
    public int num;

    public AvgCount(int total, int num) {
        this.total = total;
        this.num = num;
    }

    public double avg() {
        return total / (double) num;
    }
}