Table of contents
I. Add the dependencies
II. Configure log4j
III. Cleaning data with Kafka Streams (1)
  1. Writing the code
  2. Check the corresponding Kafka topic: user_friends
IV. Cleaning data with Kafka Streams (2)
  1. Writing the code
  2. Check the corresponding Kafka topic: event_attendees
I. Add the dependencies
Add the Kafka client, broker, and Streams dependencies (all version 2.8.0) to the project's pom.xml:
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
</properties>

<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka-clients -->
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>2.8.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka -->
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_2.12</artifactId>
        <version>2.8.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-streams</artifactId>
        <version>2.8.0</version>
    </dependency>
</dependencies>

<build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
        <plugins>
            <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
            <plugin>
                <artifactId>maven-clean-plugin</artifactId>
                <version>3.1.0</version>
            </plugin>
            <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
            <plugin>
                <artifactId>maven-resources-plugin</artifactId>
                <version>3.0.2</version>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.0</version>
            </plugin>
            <plugin>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.22.1</version>
            </plugin>
            <plugin>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.0.2</version>
            </plugin>
            <plugin>
                <artifactId>maven-install-plugin</artifactId>
                <version>2.5.2</version>
            </plugin>
            <plugin>
                <artifactId>maven-deploy-plugin</artifactId>
                <version>2.8.2</version>
            </plugin>
            <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
            <plugin>
                <artifactId>maven-site-plugin</artifactId>
                <version>3.7.1</version>
            </plugin>
            <plugin>
                <artifactId>maven-project-info-reports-plugin</artifactId>
                <version>3.0.0</version>
            </plugin>
        </plugins>
    </pluginManagement>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <configuration>
                <source>8</source>
                <target>8</target>
            </configuration>
        </plugin>
    </plugins>
</build>
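If you also want to unit-test the topologies offline (see the TopologyTestDriver sketch at the end of section IV), one optional addition — an assumption on my part, not part of the original project — is the Streams test-utils artifact:

<!-- Optional (assumed, not in the original pom): enables TopologyTestDriver -->
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-streams-test-utils</artifactId>
    <version>2.8.0</version>
    <scope>test</scope>
</dependency>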
II. Configure log4j
Put the following log4j.properties on the classpath under main (with the standard Maven layout, src/main/resources) so that only errors reach the console:
# Set everything to be logged to the console
log4j.rootCategory=ERROR, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
# Set the default spark-shell log level to WARN. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=WARN
# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark_project.jetty=WARN
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
III. Cleaning data with Kafka Streams (1)
This job reads the raw records from user_friends_raw, explodes the space-separated friend list, and writes one user,friend pair per record to user_friends.
1. Writing the code
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.*;

import java.util.ArrayList;
import java.util.Properties;

/**
 * Before cleaning (source topic: user_friends_raw):
 * user,friends
 * 3197468391,1346449342 3873244116 4226080662 1222907620
 *
 * After cleaning (target topic: user_friends):
 * user,friend
 * 3197468391,1346449342
 * 3197468391,3873244116
 * 3197468391,4226080662
 * 3197468391,1222907620
 */
public class UserFriendStream {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "userfriends1");
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "lxm147:9092");
        // false = commit offsets manually; true = auto-commit.
        // Kafka Streams manages its own commits, driven by commit.interval.ms below.
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        prop.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 1000);
        // earliest / latest / none
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());

        StreamsBuilder builder = new StreamsBuilder();
        // Main processing logic: start
        builder.stream("user_friends_raw")
                .flatMap((key, value) -> {
                    // value = 3197468391,1346449342 3873244116 4226080662 1222907620
                    ArrayList<KeyValue<String, String>> list = new ArrayList<>();
                    String[] fields = value.toString().split(",");
                    if (fields.length == 2) {
                        String userID = fields[0];
                        String[] friends = fields[1].split("\\s+");
                        for (String friendID : friends) {
                            System.out.println(userID + "\t" + friendID); // 3197468391,1346449342
                            KeyValue<String, String> kv = new KeyValue<>(null, userID + "," + friendID);
                            list.add(kv);
                        }
                    }
                    return list;
                })
                .to("user_friends");
        // Main processing logic: end

        Topology topo = builder.build();
        KafkaStreams streams = new KafkaStreams(topo, prop);
        streams.start();
    }
}
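One thing both examples in this post leave out: main returns while the Streams threads keep running, so the job can only be stopped by killing the JVM. A minimal sketch (not part of the original code) that replaces the bare streams.start() with a clean shutdown path:

// Sketch: drop-in replacement for the last line of main (streams.start()).
// Assumes `streams` is the KafkaStreams instance built above.
// Needs: import java.util.concurrent.CountDownLatch;
final CountDownLatch latch = new CountDownLatch(1);
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
    streams.close();      // stop threads, flush state, commit offsets
    latch.countDown();    // let main return
}));
streams.start();
try {
    latch.await();        // block main until Ctrl+C / SIGTERM
} catch (InterruptedException e) {
    Thread.currentThread().interrupt();
}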
2. Check the corresponding Kafka topic: user_friends
[root@lxm147 config]# kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list lxm147:9092 --topic user_friends
user_friends:0:30386403
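The offset total only confirms how many cleaned records landed on the topic. For a quick look at the records themselves, the console consumer works as well (--max-messages caps the output):
[root@lxm147 config]# kafka-console-consumer.sh --bootstrap-server lxm147:9092 --topic user_friends --from-beginning --max-messages 10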
IV. Cleaning data with Kafka Streams (2)
This job reads event_attendees_raw, where the yes/maybe/invited/no columns each hold a space-separated list of user ids, and writes one event,friendid,status record per attendee to event_attendees.
1. Writing the code
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.*;

import java.util.ArrayList;
import java.util.Properties;

/**
 * Before cleaning (source topic: event_attendees_raw):
 * event,yes,maybe,invited,no
 * 1159822043,1975964455 3973364512,2733420590 1350834692 1324909047,1723091036 3795873583,3575574655 1077296663
 *
 * After cleaning (target topic: event_attendees):
 * event,friendid,status
 * 1159822043,1975964455,yes
 * 1159822043,3973364512,yes
 * 1159822043,2733420590,maybe
 * 1159822043,1350834692,maybe
 * 1159822043,1723091036,invited
 * 1159822043,3575574655,no
 */
public class EventAttendStream {

    // Column i + 1 of a raw record holds the space-separated user ids for STATUSES[i].
    private static final String[] STATUSES = {"yes", "maybe", "invited", "no"};

    public static void main(String[] args) {
        Properties prop = new Properties();
        // Each Streams application needs its own application id (consumer group)
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "eventattendees");
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "lxm147:9092");
        // false = commit offsets manually; true = auto-commit.
        // Kafka Streams manages its own commits, driven by commit.interval.ms below.
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        prop.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 1000);
        // earliest / latest / none
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());

        StreamsBuilder builder = new StreamsBuilder();
        // Main processing logic: start
        builder.stream("event_attendees_raw")
                .flatMap((key, value) -> {
                    ArrayList<KeyValue<String, String>> list = new ArrayList<>();
                    String[] fields = value.toString().split(",");
                    String eventID = fields[0];
                    if (eventID.trim().length() > 0) {
                        // split(",") drops trailing empty columns, hence the length check
                        // before reading each status column
                        for (int i = 0; i < STATUSES.length && i + 1 < fields.length; i++) {
                            if (fields[i + 1].trim().isEmpty()) {
                                continue; // empty column in the middle of the record
                            }
                            for (String userID : fields[i + 1].split("\\s+")) {
                                System.out.println(eventID + "\t" + userID + "\t" + STATUSES[i]);
                                list.add(new KeyValue<>(null, eventID + "," + userID + "," + STATUSES[i]));
                            }
                        }
                    }
                    return list;
                })
                .to("event_attendees");
        // Main processing logic: end

        Topology topo = builder.build();
        KafkaStreams streams = new KafkaStreams(topo, prop);
        streams.start();
    }
}
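Before pointing the job at the real broker, the flatMap logic can be exercised offline with TopologyTestDriver from kafka-streams-test-utils (the optional test dependency noted in section I). A minimal sketch, assuming the StreamsBuilder code above is extracted into a buildTopology() helper — a hypothetical method introduced here just for the test:

import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.kafka.common.serialization.StringSerializer;
import org.apache.kafka.streams.*;

import java.util.Properties;

public class EventAttendStreamTest {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "eventattendees-test");
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy:9092"); // never contacted by the test driver
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());

        // buildTopology() is assumed to wrap the StreamsBuilder code from EventAttendStream.main
        Topology topo = EventAttendStream.buildTopology();
        try (TopologyTestDriver driver = new TopologyTestDriver(topo, prop)) {
            TestInputTopic<String, String> in = driver.createInputTopic(
                    "event_attendees_raw", new StringSerializer(), new StringSerializer());
            TestOutputTopic<String, String> out = driver.createOutputTopic(
                    "event_attendees", new StringDeserializer(), new StringDeserializer());

            // one raw record: two yes ids, one maybe id, empty invited and no columns
            in.pipeInput("1159822043,1975964455 3973364512,2733420590,,");

            // expect: 1159822043,1975964455,yes / 1159822043,3973364512,yes / 1159822043,2733420590,maybe
            out.readValuesToList().forEach(System.out::println);
        }
    }
}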
2. Check the corresponding Kafka topic: event_attendees
[root@lxm147 config]# kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list lxm147:9092 --topic event_attendees
event_attendees:0:11247092
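The same spot check as in section III works here; every value should follow the event,friendid,status layout, with the status spelled exactly yes, maybe, invited, or no:
[root@lxm147 config]# kafka-console-consumer.sh --bootstrap-server lxm147:9092 --topic event_attendees --from-beginning --max-messages 10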