方案一:使用reduceByKey
输入数据(word.txt,每行一个姓名):
张三
李四
王五
李四
王五
李四
王五
李四
王五
王五
李四
李四
李四
李四
李四
代码:
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
/**
 * Word-count example using the RDD API.
 *
 * <p>Reads a text file, maps every line to a {@code (line, 1)} pair, sums the
 * ones per distinct line with {@code reduceByKey}, and prints the collected
 * result to standard output.
 */
public class HelloWord {

    /** Input file read when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH = "C:\\Users\\boco\\Desktop\\word.txt";

    /**
     * Runs the word count on a local Spark master.
     *
     * @param args optional; when present, {@code args[0]} is used as the input
     *             file path instead of {@link #DEFAULT_INPUT_PATH}
     */
    public static void main(String[] args) {
        final String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        SparkSession spark = SparkSession.builder().master("local[*]").appName("Spark").getOrCreate();
        try {
            final JavaSparkContext ctx = JavaSparkContext.fromSparkContext(spark.sparkContext());

            // Read the file as lines, requesting a minimum of one partition.
            JavaRDD<String> lines = ctx.textFile(inputPath, 1);

            // Pair every line with an initial count of 1. The explicit type
            // arguments are required: with a raw PairFunction the call(String)
            // method would not implement the interface method.
            JavaPairRDD<String, Integer> ones = lines.mapToPair(
                    new PairFunction<String, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(String s) {
                            return new Tuple2<String, Integer>(s, 1);
                        }
                    });

            // Sum the counts of identical lines.
            JavaPairRDD<String, Integer> result = ones.reduceByKey(
                    new Function2<Integer, Integer, Integer>() {
                        @Override
                        public Integer call(Integer left, Integer right) {
                            return left + right;
                        }
                    });

            System.out.println(result.collect());
        } finally {
            // Release the Spark context even if the job fails.
            spark.stop();
        }
    }
}
输出:
[(张三,1), (李四,9), (王五,5)]
方案二:使用 Spark SQL
实现代码:
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.util.ArrayList;
/**
 * Word-count example using Spark SQL.
 *
 * <p>Reads a text file as rows, exposes them as a temporary view named
 * {@code words} with a single string column {@code key}, and prints the
 * number of occurrences of each distinct {@code key}.
 */
public class HelloWord {

    /** Input file read when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH = "C:\\Users\\boco\\Desktop\\word.txt";

    /**
     * Runs the grouped count on a local Spark master and shows the result table.
     *
     * @param args optional; when present, {@code args[0]} is used as the input
     *             file path instead of {@link #DEFAULT_INPUT_PATH}
     */
    public static void main(String[] args) {
        final String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        SparkSession spark = SparkSession.builder().master("local[*]").appName("Spark").getOrCreate();
        try {
            // Each line of the file becomes one single-column Row.
            JavaRDD<Row> rows = spark.read().text(inputPath).toJavaRDD();

            // Schema with one nullable string column named "key", so the SQL
            // below can refer to the column by that name.
            ArrayList<StructField> fields = new ArrayList<StructField>();
            fields.add(DataTypes.createStructField("key", DataTypes.StringType, true));
            StructType schema = DataTypes.createStructType(fields);

            Dataset<Row> ds = spark.createDataFrame(rows, schema);
            ds.createOrReplaceTempView("words");

            Dataset<Row> result = spark.sql("select key,count(0) as key_count from words group by key");
            result.show();
        } finally {
            // Release the Spark context even if the query fails.
            spark.stop();
        }
    }
}
结果:
+---+---------+
|key|key_count|
+---+---------+
| 王五| 5|
| 李四| 9|
| 张三| 1|
+---+---------+