需求:使用spark消费kafka,并按日期写入hdfs。我这里使用的是Structured Streaming。
版本:spark 2.2.0,hadoop 3.0.0,scala 2.11,kafka_2.11-1.0.1
实现:
1)pom.xml
<dependencies>
    <!-- Spark SQL core: provides SparkSession / Structured Streaming APIs.
         Uncomment <scope>provided</scope> when submitting to a cluster that
         already ships Spark on the classpath. -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <!-- <scope>provided</scope>-->
    </dependency>
    <!-- Kafka source for Structured Streaming (brings its own kafka-clients). -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql-kafka-0-10_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- Kafka broker/admin classes. Use ${scala.binary.version} instead of a
         hard-coded _2.11 suffix so it stays consistent with the Spark artifacts
         above. kafka-clients is excluded to avoid clashing with the version
         pulled in transitively by spark-sql-kafka-0-10. -->
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_${scala.binary.version}</artifactId>
        <version>${kafka.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.apache.kafka</groupId>
                <artifactId>kafka-clients</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <!-- HDFS client stack. hadoop-common is excluded here because Spark already
         provides a (potentially different) hadoop-common on its classpath. -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs-httpfs</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>
2)main
public class Main {
    // Shared HDFS handle; HdfsSink re-creates it lazily on executor JVMs where
    // this driver-side static is null (works as a plain static only in local[*]).
    public static FileSystem fs = null;

    /**
     * Starts a Structured Streaming query that reads the {@code value} column
     * of Kafka topic {@code testTopic} as a string and hands each row to
     * {@link HdfsSink}, which appends it to a per-day file on HDFS.
     *
     * @throws StreamingQueryException if the streaming query terminates with an error
     */
    public static void main(String[] args) throws StreamingQueryException {
        SparkSession sparkSession = SparkSession.builder().appName("appName")
                .master("local[*]") // demo runs locally; override via spark-submit on a cluster
                .getOrCreate();
        Dataset<Row> df = sparkSession
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers", "localhost:9092")
                .option("subscribe", "testTopic") // Kafka topic to consume
                .option("startingOffsets", "latest")
                .load()
                .selectExpr("CAST(value AS STRING)");
        // Foreach sink: records of the same calendar day are appended to one file.
        fs = getHdfsFilesystem();
        StreamingQuery fileSinkQuery = df
                .writeStream()
                .foreach(new HdfsSink())
                .outputMode("append")
                .option("checkpointLocation", "to/hdfs/checkpointPath")
                .start();
        fileSinkQuery.awaitTermination();
    }

    /**
     * Builds a FileSystem bound to {@code hdfs://localhost:8020} with append support enabled.
     *
     * @return a connected FileSystem, never {@code null}
     * @throws java.io.UncheckedIOException if the connection cannot be created.
     *         (The previous version swallowed the IOException and returned
     *         {@code null}, which surfaced later as an opaque
     *         NullPointerException inside HdfsSink.open().)
     */
    public static FileSystem getHdfsFilesystem() {
        Configuration conf = new Configuration();
        conf.setBoolean("dfs.support.append", true); // needed for FileSystem.append()
        conf.set("fs.defaultFS", "hdfs://localhost:8020");
        try {
            return FileSystem.get(conf);
        } catch (IOException e) {
            // Fail fast with the cause preserved instead of returning null.
            throw new java.io.UncheckedIOException("cannot connect to HDFS at localhost:8020", e);
        }
    }
}
3)HdfsSink实现
public class HdfsSink extends ForeachWriter<Row> {
    private String hdfsPath;                 // absolute HDFS path of the current day's file
    private FSDataOutputStream output = null; // append stream opened in open(), released in close()
    private static final Logger logger = LoggerFactory.getLogger(HdfsSink.class);
    private Path pathNew = null;

    /**
     * Creates (or rolls) today's file and positions an append stream on it.
     * If the existing file exceeds ~120 MB it is renamed with a timestamp
     * suffix and a fresh file is started, to avoid one ever-growing file.
     *
     * NOTE(review): the date is captured once per open(), so an epoch that
     * crosses midnight keeps appending to the previous day's file — confirm
     * whether that is acceptable.
     *
     * @return {@code true} when the append stream is ready; {@code false} on
     *         failure so Spark skips this partition. (The previous version
     *         returned {@code true} unconditionally, so a failed open caused
     *         NullPointerExceptions later in process().)
     */
    @Override
    public boolean open(long partitionId, long version) {
        String currentDate = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
        hdfsPath = "hdfs://localhost:8020/spark/testWn/" + currentDate + "/part-" + partitionId;
        pathNew = new Path(hdfsPath);
        try {
            if (Main.fs == null) {
                // Executor JVMs do not see the driver's static handle; rebuild it.
                Main.fs = Main.getHdfsFilesystem();
            }
            if (!Main.fs.exists(pathNew)) {
                Main.fs.create(pathNew).close();
            } else if (Main.fs.getFileStatus(pathNew).getLen() >= 120 * 1024 * 1024) {
                boolean isModify = Main.fs.rename(pathNew,
                        new Path(hdfsPath + "-" + System.currentTimeMillis()));
                if (isModify) {
                    Main.fs.create(pathNew).close();
                }
            }
            output = Main.fs.append(pathNew);
            return true;
        } catch (IOException e) {
            // Fixed: this catch previously logged the misleading tag "process";
            // pass the throwable so the full stack trace is preserved.
            logger.error("HdfsSink>open>>>>>", e);
            return false;
        }
    }

    /** Appends one Kafka record (the casted {@code value} column) plus a newline. */
    @Override
    public void process(Row value) {
        String log = value.getAs("value").toString();
        if (log.trim().length() == 0) {
            return; // skip blank records
        }
        try {
            if (output == null) {
                output = Main.fs.append(pathNew);
            }
            output.write(log.getBytes("UTF-8"));
            output.write("\n".getBytes("UTF-8")); // record separator
        } catch (IOException e) {
            logger.error("HdfsSink>process>>>>>", e);
        }
    }

    /** Releases the append stream; called once per partition epoch, even after errors. */
    @Override
    public void close(Throwable errorOrNull) {
        try {
            if (output != null) {
                output.close();
            }
        } catch (IOException e) {
            logger.error("HdfsSink>close>>>>>", e);
        }
    }
}
4)hdfs上文件存储效果展示