A simple WordCount in Scala and Java, on Spark and Flink
WordCount overview
WordCount is the foundational project of big data, the equivalent of "hello world" in other languages and a must-learn exercise. It counts how many times each word occurs in a text.
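For example, given the input "hello world hello", WordCount outputs (hello, 2) and (world, 1).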
Example 1
A simple WordCount implementation on Spark, written in Java.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
public class WordCount {
public static void main(String[] args) throws Exception {
SparkConf conf = new SparkConf().setMaster("local").setAppName("wordCount");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> rdd = jsc.textFile("D:\\Java_code\\BigDataLogTest\\src\\main\\resources\\nginx.log");
JavaRDD<String> rdd1 = rdd.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterator<String> call(String s) throws Exception {
// split the line into words; Spark 2.x+ expects an Iterator here
String[] s1 = s.split(" ");
return Arrays.asList(s1).iterator();
}
});
JavaPairRDD<String, Integer> map = rdd1.mapToPair(new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String s) throws Exception {
return new Tuple2<String,Integer>(s,1);
}
});
JavaPairRDD<String, Integer> stringIntegerJavaPairRDD = map.reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
});
stringIntegerJavaPairRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
System.out.println(stringIntegerTuple2);
}
});
jsc.stop();
}
}
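For comparison, the same job is much shorter with Java 8 lambdas on Spark 2.x or later. This is a minimal sketch; the class name WordCountLambda and the relative input path data/nginx.log are placeholders:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
public class WordCountLambda {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("wordCountLambda");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaPairRDD<String, Integer> counts = jsc
                .textFile("data/nginx.log")                                 // placeholder input path
                .flatMap(line -> Arrays.asList(line.split(" ")).iterator()) // line -> words
                .mapToPair(word -> new Tuple2<>(word, 1))                   // word -> (word, 1)
                .reduceByKey(Integer::sum);                                 // sum the counts per word
        counts.foreach(t -> System.out.println(t));                         // print each (word, count)
        jsc.stop();
    }
}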
Example 2
A simple WordCount implementation on Spark, written in Scala.
// split + flatMap version
// split + map + reduceByKey version
// split + map + reduceByKey shorthand version
package com.xxx.spark
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount extends App {
private val conf = new SparkConf()
conf.setMaster("local").setAppName("wc")
private val sparkContext = new SparkContext(conf)
private val file: String = WordCount.getClass.getClassLoader.getResource("data/wc.txt").getFile
// RDD is Spark's core abstraction; all the data operations below go through it
private val line: RDD[String] = sparkContext.textFile(file)
// Transform the data
// Split each line on spaces
private val word: RDD[String] = line.flatMap(x => x.split(" "))
// Pair each word with a count: string -> (string, 1), e.g. "aaa" -> ("aaa", 1)
private val wordcount: RDD[(String, Int)] = word.map(x => (x, 1))
// Sum the counts by key; in ("aaa", 1) the key is "aaa"
private val result: RDD[(String, Int)] = wordcount.reduceByKey((x, y) => {
x + y
})
// Enumerate each key and print the result
result.foreach(println)
// The whole pipeline above (split + pair + reduce + print) in shorthand
line.flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).foreach(println)
// Traversal: print every split word
word.foreach(println)
sparkContext.stop()
}
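A caveat on foreach(println): the closure runs on the executors, so the counts only show up in the console when running with a local master; on a cluster they would land in the executor logs. For small results you would typically bring them back to the driver with collect() or write them out with saveAsTextFile instead.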
Example 3
// Flink streaming version
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
public class StreamingJob {
public static void main(String[] args) throws Exception {
// set up the streaming execution environment
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStreamSource<String> source = env.readTextFile("data/nginx.log");
source
.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
String[] s1 = s.split(" ");
for (String s2 : s1) {
collector.collect(Tuple2.of(s2, 1));
}
}
})
.keyBy(0)
.sum(1)
.print();
// lambda style: pipeline broken into intermediate operators
lambdaFunc(source);
// common: plain anonymous-function style
commonFunc(source);
// rich: rich-function style, with access to lifecycle hooks and runtime context
richFunc(source);
// process: process-function style; processElement receives a runtime Context
processFunc(source);
// execute program
env.execute("Flink Streaming Java API Skeleton");
}
private static void processFunc(DataStreamSource<String> source) {
source
.process(new ProcessFunction<String, Tuple2<String,Integer>>() {
@Override
public void processElement(String value, ProcessFunction<String, Tuple2<String, Integer>>.Context ctx, Collector<Tuple2<String, Integer>> out) throws Exception {
String[] s = value.split(" ");
for (String s1 : s) {
out.collect(Tuple2.of(s1,1));
}
}
})
.keyBy(0)
.sum(1)
.print();
}
private static void richFunc(DataStreamSource<String> source) {
source
.flatMap(new RichFlatMapFunction<String, Tuple2<String, Integer>>() {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
String[] s1 = s.split(" ");
for (String s2 : s1) {
collector.collect(Tuple2.of(s2, 1));
}
}
})
.keyBy(0)
.sum(1) // sum the count field (index 1), not the word
.print();
}
private static void commonFunc(DataStreamSource<String> source) {
source.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
String[] s1 = s.split(" ");
for (String s2 : s1) {
collector.collect(Tuple2.of(s2, 1));
}
}
})
.keyBy(0)
.sum(1)
.print();
}
private static void lambdaFunc(DataStreamSource<String> source) {
SingleOutputStreamOperator<String> stringSingleOutputStreamOperator = source.flatMap(new FlatMapFunction<String, String>() {
@Override
public void flatMap(String s, Collector<String> collector) throws Exception {
String[] s1 = s.split(" ");
for (String s2 : s1) {
collector.collect(s2);
}
}
});
SingleOutputStreamOperator<Tuple2<String, Integer>> map = stringSingleOutputStreamOperator.map(new MapFunction<String, Tuple2<String, Integer>>() {
@Override
public Tuple2<String, Integer> map(String s) throws Exception {
return Tuple2.of(s, 1);
}
});
KeyedStream<Tuple2<String, Integer>, Tuple> tuple2TupleKeyedStream = map.keyBy(0);
SingleOutputStreamOperator<Tuple2<String, Integer>> sum = tuple2TupleKeyedStream.sum(1);
DataStreamSink<Tuple2<String, Integer>> print = sum.print();
}
}
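Despite its name, lambdaFunc above still uses anonymous inner classes; it just stores each operator in an intermediate variable. A true lambda version is sketched below (the class name StreamingLambdaJob and the input path are assumptions); note that because Java erases the generic types of lambdas, Flink requires an explicit returns(...) type hint:
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
public class StreamingLambdaJob {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.readTextFile("data/nginx.log") // assumed input path
                .flatMap((String line, Collector<Tuple2<String, Integer>> out) -> {
                    for (String word : line.split(" ")) {
                        out.collect(Tuple2.of(word, 1)); // word -> (word, 1)
                    }
                })
                .returns(Types.TUPLE(Types.STRING, Types.INT)) // type hint: the lambda's generics are erased
                .keyBy(value -> value.f0) // key by the word
                .sum(1) // sum the count field
                .print();
        env.execute("Flink streaming lambda WordCount");
    }
}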
Example 4
// Flink batch version
package org.example;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class BatchJob {
public static void main(String[] args) throws Exception {
// set up the batch execution environment
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSource<String> stringDataSource = env.readTextFile("data/nginx.log");
stringDataSource
.flatMap(new FlatMapFunction<String, Tuple2<String,Integer>>() {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
String[] s1 = s.split(" ");
for (String s2 : s1) {
collector.collect(Tuple2.of(s2,1));
}
}
})
.groupBy(0)
.sum(1)
.print();
stringDataSource
.flatMap(new FlatMapFunction<String, Tuple2<String,Integer>>() {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
String[] s1 = s.split(" ");
for (String s2 : s1) {
collector.collect(Tuple2.of(s2,1));
}
}
})
.groupBy(0)
.reduce(new ReduceFunction<Tuple2<String, Integer>>() {
@Override
public Tuple2<String, Integer> reduce(Tuple2<String, Integer> stringIntegerTuple2, Tuple2<String, Integer> t1) throws Exception {
return Tuple2.of(t1.f0,t1.f1+stringIntegerTuple2.f1);
}
})
.print();
// env.execute("Flink Batch Java API Skeleton");
}
}
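One detail worth noting: in the DataSet API, print() itself triggers job execution and pulls the results back to the client, which is why the trailing env.execute(...) stays commented out; calling it after print() would fail because no new sinks have been defined since the last execution.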