filter
A filter, as the name suggests, is an operator for sifting data: it keeps only the elements that satisfy a condition.
Sample file
- Under the project directory there is a folder in containing a file sample.txt with the following content:
aa bb cc aa aa aa dd dd ee ee ee ee
ff aa bb zks
ee kks
ee zz zks
filter
- filter() takes a predicate function, applies it to every element of the RDD, and returns a new RDD containing only the elements for which the function returns true (see the small sketch below).
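Before the file-based example, here is a minimal sketch of the semantics on an in-memory collection. It is not from the original tutorial; the numbers are made up, and it assumes the same SparkContext sc as the examples that follow:
val nums = sc.parallelize(List(1, 2, 3, 4, 5))
// keep only the even numbers; the element count can shrink, never grow
val evens = nums.filter(n => n % 2 == 0)
evens.collect.foreach(println)  // prints 2 and 4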
Scala version
- Find the lines in sample.txt that contain zks:
package nj.zb.sparkstu

import org.apache.spark.{SparkConf, SparkContext}

object FilterScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("filterscala")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("in/sample.txt")
    // keep only the lines that contain "zks"
    val a = lines.filter(line => line.contains("zks"))
    a.collect.foreach(println)
  }
}
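The predicate can also be written with Scala's placeholder syntax, e.g. lines.filter(_.contains("zks")); the result is identical.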
Result (the two lines containing zks):
ff aa bb zks
ee zz zks
Java version
- Find the lines in sample.txt that contain zks:
package nj.zb.sparkstu;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.List;

public class FilterJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("filterapp");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("in/sample.txt");
        // filter: keep only the lines that contain "zks"
        JavaRDD<String> filterRdd = lines.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String v1) throws Exception {
                return v1.contains("zks");
            }
        });
        List<String> collect = filterRdd.collect();
        for (String str : collect) {
            System.out.println(str);
        }
    }
}
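Note: when compiling with Java 8 or later, the anonymous inner class can be replaced by a lambda, e.g. lines.filter(line -> line.contains("zks")); the RDD produced is the same.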
- Result (the two lines containing zks):
ff aa bb zks
ee zz zks
map
- map() takes a function, applies it to every element of the RDD, and builds the resulting RDD out of the function's return values.
- map is one-to-one: each element of the input RDD produces exactly one element in the output RDD (see the sketch below).
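A minimal sketch, not from the original tutorial (numbers made up, same sc assumed), showing that the element count is preserved:
val nums = sc.parallelize(List(1, 2, 3))
// exactly one output element per input element
val squares = nums.map(n => n * n)
squares.collect.foreach(println)  // prints 1, 4, 9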
Scala version
- Turn each line of sample.txt into an array of words:
package nj.zb.sparkstu

import org.apache.spark.{SparkConf, SparkContext}

object MapScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("mapscala")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("in/sample.txt")
    // each line becomes an Array[String]; the result is an RDD[Array[String]]
    val b = lines.map(line => line.split("\\s+"))
    // alternative: convert each array to a List for readable printing
    // b.map(_.toList).collect.foreach(println)
    b.collect.foreach(x => x.foreach(println))
  }
}
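Note that the result here is an RDD[Array[String]]: one array per line. Compare this with flatMap below, which flattens the arrays into a single RDD[String] of words.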
Result: every word in the file is printed on its own line.
Java version
- Turn each line of sample.txt into an array of words:
package nj.zb.sparkstu;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class MapJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("mapapp");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("in/sample.txt");
        // map: each line becomes an Iterable<String> of its words
        JavaRDD<Iterable<String>> mapRDD = lines.map(new Function<String, Iterable<String>>() {
            @Override
            public Iterable<String> call(String v1) throws Exception {
                String[] split = v1.split(" ");
                return Arrays.asList(split);
            }
        });
        List<Iterable<String>> collect = mapRDD.collect();
        for (Iterable<String> it : collect) {
            Iterator<String> iterator = it.iterator();
            while (iterator.hasNext()) {
                System.out.println(iterator.next());
            }
        }
    }
}
Result: every word in the file is printed on its own line, the same output as the Scala version.
flatMap
- flatMap() applies a function to every element; for each element the function returns an iterator over zero or more values, and all of those values are flattened into a single resulting RDD (see the sketch below).
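A minimal sketch, not from the original tutorial (strings made up, same sc assumed), contrasting flatMap with map:
val demo = sc.parallelize(List("aa bb", "cc"))
// map keeps one output per input: RDD[Array[String]] with 2 elements
val arrays = demo.map(_.split(" "))
// flatMap flattens the pieces: RDD[String] with 3 elements
val words = demo.flatMap(_.split(" "))
words.collect.foreach(println)  // prints aa, bb, cc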
Scala version
- Split the data into individual words:
package nj.zb.sparkstu

import org.apache.spark.{SparkConf, SparkContext}

object FlatMapScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("flatmapscala")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("in/sample.txt")
    // split every line and flatten the pieces into one RDD[String] of words
    val c = lines.flatMap(line => line.split(" "))
    c.collect.foreach(println)
  }
}
Result: every word in the file is printed on its own line.
Java version (Spark 2.0 and later)
- In Spark 2.0 and later, the FlatMapFunction interface changed: call() must now return an Iterator rather than an Iterable, which is the only difference from the pre-2.0 code.
package nj.zb.sparkstu;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class FlatMapJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("flatmapapp");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("in/sample.txt");
        // flatMap: since Spark 2.0, call() returns an Iterator<String>
        JavaRDD<String> flatMapRdd = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String s) throws Exception {
                String[] split = s.split("\\s+");
                return Arrays.asList(split).iterator();
            }
        });
        List<String> collect = flatMapRdd.collect();
        for (String str : collect) {
            System.out.println(str);
        }
    }
}
Result: every word in the file is printed on its own line.
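As a closing note, flatMap is typically the first step of a word count. A minimal Scala sketch, not from the original tutorial, that combines it with the standard map and reduceByKey operators on the same sample file:
val counts = sc.textFile("in/sample.txt")
  .flatMap(_.split("\\s+"))       // one element per word
  .map(word => (word, 1))         // pair each word with a count of 1
  .reduceByKey(_ + _)             // sum the counts for each word
counts.collect.foreach(println)   // e.g. (aa,5), (ee,6), (zks,2), ...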