KafkaStream: Cleaning Kafka Data with the Kafka Streams API (written in Java)


Contents

I. Add dependencies

II. Configure log4j under main

III. Cleaning data with Kafka Streams in Java (Part 1)

1. Writing the code

2. Check the corresponding Kafka topic: user_friends

IV. Cleaning data with Kafka Streams in Java (Part 2)

1. Writing the code

2. Check the corresponding Kafka topic: event_attendees


I. Add dependencies

The pom.xml pulls in the Kafka client, broker, and Streams artifacts, all at version 2.8.0:

 <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka-clients -->
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka-clients</artifactId>
      <version>2.8.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka -->
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka_2.12</artifactId>
      <version>2.8.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka-streams</artifactId>
      <version>2.8.0</version>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
      <plugins>
          <plugin>
              <groupId>org.apache.maven.plugins</groupId>
              <artifactId>maven-compiler-plugin</artifactId>
              <configuration>
                  <source>8</source>
                  <target>8</target>
              </configuration>
          </plugin>
      </plugins>
  </build>

II. Configure log4j under main

Place the following log4j.properties file in src/main/resources so it lands on the classpath:

# Set everything to be logged to the console
log4j.rootCategory=ERROR, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Set the default spark-shell log level to WARN. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=WARN

# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark_project.jetty=WARN
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO

# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR

III. Cleaning data with Kafka Streams in Java (Part 1)

1. Writing the code

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.*;

import java.util.ArrayList;
import java.util.Properties;

/**
 * Before cleaning (source topic: user_friends_raw):
 * user      ,friends
 * 3197468391,1346449342 3873244116 4226080662 1222907620
 *
 * After cleaning (target topic: user_friends):
 * user      ,friend
 * 3197468391,1346449342
 * 3197468391,3873244116
 * 3197468391,4226080662
 * 3197468391,1222907620
 */

public class UserFriendStream {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "userfriends1");
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "lxm147:9092");
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); // false = commit offsets manually; true = auto-commit
        prop.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 1000);
        // Offset reset options: earliest, latest, none
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());

        StreamsBuilder builder = new StreamsBuilder();
        // Main business logic: start
        builder.stream("user_friends_raw")
                .flatMap((key, value) -> {// value = 3197468391,1346449342 3873244116 4226080662 1222907620
                    ArrayList<KeyValue<String, String>> list = new ArrayList<>();
                    String[] fields = value.toString().split(",");
                    if (fields.length == 2) {
                        String userID = fields[0];
                        String[] friends = fields[1].split("\\s+");
                        for (String friendID :
                                friends) {
                            System.out.println(userID + "\t" + friendID);//3197468391,1346449342
                            KeyValue<String, String> kv = new KeyValue<>(null, userID + "," + friendID);
                            list.add(kv);
                        }
                    }
                    return list;
                })
                .to("user_friends");

        // Main business logic: end
        Topology topo = builder.build();
        KafkaStreams streams = new KafkaStreams(topo, prop);
        streams.start();
    }
}
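
The job keeps running after streams.start() until the process is killed. One optional addition (not in the original post) is a shutdown hook, so the Streams instance closes cleanly and commits its offsets on exit. A minimal sketch, placed right after streams.start():

        // Optional: close the KafkaStreams instance cleanly when the JVM shuts down.
        Runtime.getRuntime().addShutdownHook(new Thread(streams::close));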

2. Check the corresponding Kafka topic: user_friends

[root@lxm147 config]# kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list lxm147:9092 --topic user_friends
user_friends:0:30386403
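
GetOffsetShell only reports the end offset of each partition (here, partition 0 of user_friends holds 30,386,403 records). To spot-check the cleaned records themselves, a small consumer can be used. The sketch below is an addition, not part of the original post; the class name UserFriendsVerify and the group id verify-user-friends are arbitrary choices:

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;

import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

public class UserFriendsVerify {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "lxm147:9092");
        prop.put(ConsumerConfig.GROUP_ID_CONFIG, "verify-user-friends"); // arbitrary group id
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);

        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(prop)) {
            consumer.subscribe(Collections.singleton("user_friends"));
            // One poll is enough for a spot check; expect values like "3197468391,1346449342"
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(5));
            for (ConsumerRecord<String, String> record : records) {
                System.out.println(record.value());
            }
        }
    }
}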

IV. Cleaning data with Kafka Streams in Java (Part 2)

1. Writing the code

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.*;

import java.util.ArrayList;
import java.util.Properties;

/**
 * Before cleaning (source topic: event_attendees_raw):
 * event     ,yes                  ,maybe                           ,invited              ,no
 * 1159822043,1975964455 3973364512,2733420590 1350834692 1324909047,1723091036 3795873583,3575574655 1077296663
 *
 * After cleaning (target topic: event_attendees):
 * event     ,friendid  ,status
 * 1159822043,1975964455,yes
 * 1159822043,3973364512,yes
 * 1159822043,2733420590,maybe
 * 1159822043,1350834692,maybe
 * 1159822043,1324909047,maybe
 * 1159822043,1723091036,invited
 * 1159822043,3795873583,invited
 * 1159822043,3575574655,no
 * 1159822043,1077296663,no
 */
public class EventAttendStream {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "eventattendees"); // each Streams app needs its own application.id
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "lxm147:9092");
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); // false = commit offsets manually; true = auto-commit
        prop.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 1000);
        // Offset reset options: earliest, latest, none
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        StreamsBuilder builder = new StreamsBuilder();

        // Main business logic: start
        builder.stream("event_attendees_raw")
                .flatMap((key, value) -> {
                    ArrayList<KeyValue<String, String>> list = new ArrayList<>();
                    String[] fields = value.toString().split(",");
                    String eventID = fields[0];
                    if (eventID.trim().length() > 0) {
                        if(fields.length>=2){
                            String[] yesID = fields[1].split("\\s+");
                            for (String y :
                                    yesID) {
                                System.out.println(eventID+"\t"+y+" yes");
                                list.add(new KeyValue<>(null,eventID+","+y+",yes"));
                            }
                        }if(fields.length>=3){
                            String[] maybeID = fields[2].split("\\s+");
                            for (String y :
                                    maybeID) {
                                System.out.println(eventID+"\t"+y+" maybeID");
                                list.add(new KeyValue<>(null,eventID+","+y+",maybeID"));
                            }
                        }if(fields.length>=4){
                            String[] invitedID = fields[3].split("\\s+");
                            for (String y :
                                    invitedID) {
                                System.out.println(eventID+"\t"+y+" invitedID");
                                list.add(new KeyValue<>(null,eventID+","+y+",invitedID"));
                            }
                        }if(fields.length>=5){// 判断no的位置是否有值
                            String[] noID = fields[4].split("\\s+");
                            for (String y :
                                    noID) {
                                System.out.println(eventID+"\t"+y+" noID");
                                list.add(new KeyValue<>(null,eventID+","+y+",noID"));
                            }
                        }
                    }
                    return list;
                })
                .to("event_attendees");
        // Main business logic: end
        Topology topo = builder.build();
        KafkaStreams streams = new KafkaStreams(topo, prop);
        streams.start();
    }
}
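
To test the flatMap logic without a running broker, Kafka ships TopologyTestDriver in the kafka-streams-test-utils module (not in the pom above; it would be added as org.apache.kafka:kafka-streams-test-utils:2.8.0 with test scope). The sketch below further assumes the builder.stream(...).flatMap(...).to(...) chain above has been moved into a hypothetical static helper EventAttendStream.buildTopology() that returns builder.build():

import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.kafka.common.serialization.StringSerializer;
import org.apache.kafka.streams.*;

import java.util.Properties;

public class EventAttendStreamTest {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "eventattendees-test");
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy:9092"); // the test driver never contacts a broker
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());

        // buildTopology() is a hypothetical refactor of the topology-building code above
        Topology topo = EventAttendStream.buildTopology();
        try (TopologyTestDriver driver = new TopologyTestDriver(topo, prop)) {
            TestInputTopic<String, String> in = driver.createInputTopic(
                    "event_attendees_raw", new StringSerializer(), new StringSerializer());
            TestOutputTopic<String, String> out = driver.createOutputTopic(
                    "event_attendees", new StringDeserializer(), new StringDeserializer());

            in.pipeInput(null, "1159822043,1975964455 3973364512,2733420590,1723091036,3575574655");
            while (!out.isEmpty()) {
                System.out.println(out.readValue()); // e.g. 1159822043,1975964455,yes
            }
        }
    }
}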

2. Check the corresponding Kafka topic: event_attendees

[root@lxm147 config]# kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list lxm147:9092 --topic event_attendees
event_attendees:0:11247092
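
As with user_friends, the hypothetical UserFriendsVerify consumer from section III can be pointed at event_attendees to confirm that records come out in the event,friendid,status format.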