1. pom.xml:
Note: several versions of the spark-streaming dependency were tried and failed to work; the version used in the snippet below is the one that finally built and ran.
<dependencies>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.10</artifactId>
    <version>1.5.2</version>
  </dependency>
</dependencies>
<build>
  <plugins>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-shade-plugin</artifactId>
      <version>2.3</version>
      <executions>
        <execution>
          <phase>package</phase>
          <goals>
            <goal>shade</goal>
          </goals>
          <configuration>
            <filters>
              <filter>
                <artifact>*:*</artifact>
                <excludes>
                  <!-- strip signature files so the shaded jar is not rejected at runtime -->
                  <exclude>META-INF/*.SF</exclude>
                  <exclude>META-INF/*.DSA</exclude>
                </excludes>
              </filter>
            </filters>
            <transformers>
              <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                <resource>META-INF/spring.handlers</resource>
              </transformer>
              <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                <resource>META-INF/spring.schemas</resource>
              </transformer>
              <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                <!-- Main-Class of the shaded jar; JavaStreaming is this project's main class -->
                <mainClass>JavaStreaming</mainClass>
              </transformer>
            </transformers>
            <!-- this element takes a boolean, not a class name -->
            <createDependencyReducedPom>true</createDependencyReducedPom>
          </configuration>
        </execution>
      </executions>
    </plugin>
  </plugins>
</build>
2. Spark Streaming online word count:
import java.util.Arrays;

import org.apache.spark.*;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.*;
import org.apache.spark.streaming.api.java.*;

import scala.Tuple2;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class JavaStreaming {
    private static final Logger logger = LoggerFactory.getLogger(JavaStreaming.class);

    public static void main(String[] args) throws InterruptedException {
        logger.info("start..");
        SparkConf conf = new SparkConf().setAppName("wordCountOnline");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));

        // Spark Streaming input source; ipAddress and port are placeholders,
        // fill in the host and port of your data source
        JavaReceiverInputDStream<String> lines = jssc.socketTextStream(ipAddress, port);
        lines.count().print();

        // Split each line into words, returning an Iterable<String>
        System.out.println("words FlatMapFunction");
        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            private static final long serialVersionUID = 1L;
            public Iterable<String> call(String line) {
                return Arrays.asList(line.split(" "));
            }
        });

        // Map each word to a count of 1
        JavaPairDStream<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });

        // Sum the counts of identical words
        JavaPairDStream<String, Integer> wordCount = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;
            public Integer call(Integer v1, Integer v2) {
                return v1 + v2;
            }
        });

        wordCount.count().print();
        wordCount.print();

        jssc.start();
        jssc.awaitTermination();
    }
}
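Independent of Spark, the per-batch transformation chain above (split lines into words, pair each word with 1, sum by key) can be sketched with plain java.util.stream. This is only an illustrative stand-in for what each micro-batch computes; WordCountSketch and countWords are made-up names, not Spark API:

```java
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class WordCountSketch {
    // One micro-batch: flatMap lines into words, then the mapToPair +
    // reduceByKey pair collapses to "map each word to 1 and sum by key".
    static Map<String, Integer> countWords(List<String> batch) {
        return batch.stream()
                .flatMap(line -> Arrays.stream(line.split(" ")))
                .collect(Collectors.toMap(w -> w, w -> 1, Integer::sum));
    }

    public static void main(String[] args) {
        // e.g. {a=1, b=2, c=1} for the two lines below (map order unspecified)
        System.out.println(countWords(Arrays.asList("a b", "b c")));
    }
}
```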
(1) When the Spark Streaming program above runs, its statements are not carried out sequentially from top to bottom; the work is executed in a distributed fashion.
(2) Port 9999 was tried first but turned out to be occupied; switching to 7777 succeeded.
(3) The real-time computation above runs only when the monitored endpoint produces new data, and each batch counts only the newly arrived portion.
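Because reduceByKey starts fresh every batch, cumulative totals across batches require a stateful operator (Spark Streaming provides updateStateByKey for this). The merge it maintains can be sketched in plain Java; RunningCounts is a hypothetical illustration, not part of Spark:

```java
import java.util.HashMap;
import java.util.Map;

public class RunningCounts {
    private final Map<String, Integer> total = new HashMap<>();

    // Merge one micro-batch's per-batch word counts into the running total.
    // This mirrors the state a stateful operator keeps across batches,
    // whereas plain reduceByKey only sees the current batch.
    public Map<String, Integer> update(Map<String, Integer> batchCounts) {
        for (Map.Entry<String, Integer> e : batchCounts.entrySet()) {
            total.merge(e.getKey(), e.getValue(), Integer::sum);
        }
        return total;
    }
}
```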
(4) To monitor a directory in real time instead, replace the statement JavaReceiverInputDStream<String> lines = jssc.socketTextStream(ipAddress, port) with:
JavaDStream<String> lines = jssc.textFileStream("hdfs:///user/data");
This monitors the target directory for changes. Likewise, the word-count pipeline runs only when files are added to that directory, computing in real time over the newly added files.
Note: use an absolute path, and all files in the directory must have the same format!
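textFileStream feeds each batch only with files that newly appear in the monitored directory. That pickup behavior can be sketched (outside Spark) as a tracker that remembers which files have already been seen; NewFileTracker is a made-up illustrative class:

```java
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class NewFileTracker {
    private final Set<String> seen = new HashSet<>();

    // Returns only the files not seen in any earlier scan, mimicking how
    // textFileStream hands each batch just the files newly added to the
    // directory, never reprocessing ones from previous batches.
    public List<String> newFiles(Collection<String> listing) {
        List<String> fresh = new ArrayList<>();
        for (String f : listing) {
            if (seen.add(f)) {
                fresh.add(f);
            }
        }
        return fresh;
    }
}
```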
3. Submitting and launching the Spark job:
spark-submit --class JavaStreaming --master yarn-cluster spark-streaming-0.0.1-SNAPSHOT.jar
(Here JavaStreaming is the main class and spark-streaming-0.0.1-SNAPSHOT.jar is the jar built above.)
List Spark jobs:
yarn application -list
Kill a Spark job (note that yarn application -kill takes the application ID shown by the list command, not the job name):
yarn application -kill <application-id>