Create the project
A Scala Maven project.
Add dependencies
<!-- version properties -->
<properties>
    <scala.version>2.12.10</scala.version>
    <spark.version>3.0.1</spark.version>
    <spark.scala.version>2.12</spark.scala.version>
</properties>
<dependencies>
    <!-- scala -->
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>org.specs</groupId>
        <artifactId>specs</artifactId>
        <version>1.2.5</version>
        <scope>test</scope>
    </dependency>
    <!-- spark core -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${spark.scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- commons utilities -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.9</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.4</version>
    </dependency>
    <dependency>
        <groupId>org.jeecgframework.nacos</groupId>
        <artifactId>nacos-client</artifactId>
        <version>1.4.1</version>
    </dependency>
    <!-- spark sql -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${spark.scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- spark hive -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_${spark.scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.48</version>
    </dependency>
    <!-- spark graphx -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-graphx_${spark.scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- spark streaming -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_${spark.scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_${spark.scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- web socket -->
    <dependency>
        <groupId>org.eclipse.jetty.websocket</groupId>
        <artifactId>websocket-servlet</artifactId>
        <version>9.4.35.v20201120</version>
    </dependency>
</dependencies>
Create the resources
Copy core-site.xml, hdfs-site.xml, and hive-site.xml into src/main/resources; take them from the Hadoop and Hive configuration directories on the virtual machine.
log4j.properties is as follows:
# ERROR hides routine log output; change it to INFO to show logs
log4j.rootLogger=ERROR, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/hadoop.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
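With these files on the classpath, a SparkSession can reach HDFS and the Hive metastore without hard-coding any addresses. A minimal smoke-test sketch (the object name and sample query are illustrative, not part of the original setup):

import org.apache.spark.sql.SparkSession

object HiveSmokeTest {
  def main(args: Array[String]): Unit = {
    // core-site.xml / hdfs-site.xml / hive-site.xml are picked up from
    // src/main/resources, so no cluster addresses appear in code.
    val spark = SparkSession.builder()
      .appName("hive-smoke-test")
      .master("local[*]")   // run inside the IDE; adjust when submitting to a cluster
      .enableHiveSupport()  // requires the spark-hive dependency above
      .getOrCreate()

    // If the configuration is read correctly, this lists the metastore's databases.
    spark.sql("show databases").show()
    spark.stop()
  }
}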
Create two classes
Producer (Java)
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Properties;
import java.util.concurrent.ExecutionException;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;

public class Producer {
    public static void main(String[] args) throws IOException, ExecutionException, InterruptedException {
        Properties config = new Properties();
        // Broker connection
        config.setProperty("bootstrap.servers", "192.168.6.130:9092");
        // Fault tolerance: retry twice; acks=-1 waits for all in-sync replicas
        config.setProperty("retries", "2");
        config.setProperty("acks", "-1");
        // Batching: a batch is sent as soon as either condition is met,
        // batch.size (in bytes) or linger.ms
        config.setProperty("batch.size", "128");
        config.setProperty("linger.ms", "500");
        // Serializers for message keys and values
        config.setProperty("key.serializer", "org.apache.kafka.common.serialization.LongSerializer");
        config.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        KafkaProducer<Long, String> producer = new KafkaProducer<>(config);
        BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
        long count = 0;
        final int PARTITION = 0;
        final String TOPIC = "kb12_01";
        while (true) {
            String input = reader.readLine();
            if (input == null || input.equalsIgnoreCase("exit")) {
                break;
            }
            ProducerRecord<Long, String> record = new ProducerRecord<>(TOPIC, PARTITION, ++count, input);
            // send() is asynchronous; get() blocks until the broker acknowledges the record
            RecordMetadata rmd = producer.send(record).get();
            System.out.println(rmd.topic() + "\t" + rmd.partition() + "\t" + rmd.offset() + "\t" + count + ":" + input);
        }
        reader.close();
        producer.close();
    }
}
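Run the producer and type lines into the console: each line is sent synchronously to partition 0 of topic kb12_01 with an auto-incrementing Long key, and the acknowledged topic/partition/offset is printed back. Typing exit closes the producer. Note that batch.size is measured in bytes, so 128 is tiny; that is fine for a console demo but would normally be much larger.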
Consumer (Scala)
import java.time.Duration
import java.util
import java.util.Properties

import org.apache.kafka.clients.consumer.KafkaConsumer

object Consumer {
  def main(args: Array[String]): Unit = {
    val config: Properties = new Properties()
    config.setProperty("bootstrap.servers", "192.168.6.130:9092")
    // Deserializers must match the producer's serializers
    config.setProperty("key.deserializer", "org.apache.kafka.common.serialization.LongDeserializer")
    config.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    config.setProperty("group.id", "kafka_kb12_01")
    config.setProperty("enable.auto.commit", "true")
    // Start from the earliest offset when the group has no committed offset yet
    config.setProperty("auto.offset.reset", "earliest")

    val topics = util.Arrays.asList("kb12_01")
    val consumer: KafkaConsumer[Long, String] = new KafkaConsumer[Long, String](config)
    consumer.subscribe(topics)
    try {
      while (true) {
        // Blocking read: poll waits up to 5 seconds for new records
        consumer.poll(Duration.ofSeconds(5)).forEach(e => {
          println(s"${e.key()}\t${e.value()}")
        })
      }
    } finally {
      consumer.close()
    }
  }
}
Start the producer and send some data first, then run the consumer to read it.
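Since the pom also pulls in spark-streaming-kafka-0-10, the same topic can be consumed from a Spark Streaming job instead of a bare KafkaConsumer. A sketch under that assumption (the object name, batch interval, and group id are illustrative):

import org.apache.kafka.common.serialization.{LongDeserializer, StringDeserializer}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object SparkKafkaConsumer {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("spark-kafka-consumer").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches (illustrative)

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.6.130:9092",
      "key.deserializer" -> classOf[LongDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "kafka_kb12_02", // a different group id so both consumers see all records
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (true: java.lang.Boolean)
    )

    val stream = KafkaUtils.createDirectStream[Long, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[Long, String](Seq("kb12_01"), kafkaParams)
    )

    // Print each record's key and value, mirroring the plain KafkaConsumer above
    stream.map(r => s"${r.key()}\t${r.value()}").print()

    ssc.start()
    ssc.awaitTermination()
  }
}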