// 共两种格式:
// 1. 行格式: 直接将数据按行编码写入文件中
// 2. 列格式: 将一批数据按列格式写入文件, 比较重要的有写入 parquet 文件
public class Demo2 {
/**
 * Demonstrates Flink's {@code FileSink} in its two output formats:
 * row format (each record encoded and written line by line) and bulk /
 * columnar format (a batch of records written as Parquet files).
 *
 * @param args unused command-line arguments
 * @throws Exception propagated from {@code env.execute()}
 */
public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    DataStreamSource<EventLog> stream = env.addSource(new MySourceFunction());

    /*
     * 1. Row format:
     *    1.1 serialize each record to a JSON string
     *    1.2 write the strings out line by line
     */
    SingleOutputStreamOperator<String> jsoned = stream.map(JSON::toJSONString);
    FileSink<String> rowSink = FileSink
            .forRowFormat(new Path("d://123"), new SimpleStringEncoder<String>())
            .build();
    jsoned.sinkTo(rowSink);

    /*
     * 2. Bulk (columnar) format — three ways to obtain a Parquet writer factory:
     *    2.1 hand-write an Avro schema
     *    2.2 use a SpecificRecord class generated from an .avsc file
     *    2.3 map a plain Java bean via reflection
     */
    // ParquetAvroWriters.forGenericRecord()  // option 2.1, not shown here

    // 2.2: writer factory from the generated Avro SpecificRecord class.
    ParquetWriterFactory<AvroEventLog> factory1 = ParquetAvroWriters.forSpecificRecord(AvroEventLog.class);
    FileSink<AvroEventLog> fileSink = FileSink.forBulkFormat(new Path("d://123"), factory1)
            .withBucketCheckInterval(10000)
            .withBucketAssigner(new DateTimeBucketAssigner<>())
            // Bulk-format sinks can only roll on checkpoint.
            .withRollingPolicy(OnCheckpointRollingPolicy.build())
            .build();
    SingleOutputStreamOperator<AvroEventLog> outStream = stream.map(bean -> {
        // AvroEventLog expects CharSequence keys/values, so copy the
        // String->String event-info map entry by entry.
        HashMap<CharSequence, CharSequence> mp = new HashMap<>();
        for (Map.Entry<String, String> entry : bean.getEventInfo().entrySet()) {
            mp.put(entry.getKey(), entry.getValue());
        }
        return new AvroEventLog(bean.getGuid(), bean.getSessionId(), bean.getEventId(), bean.getTimeStamp(), mp);
    });
    outStream.sinkTo(fileSink);

    // 2.3: writer factory obtained by reflecting over the plain EventLog bean.
    ParquetWriterFactory<EventLog> factory2 = ParquetAvroWriters.forReflectRecord(EventLog.class);
    FileSink<EventLog> fileSink1 = FileSink.forBulkFormat(new Path("d://234"), factory2)
            .build();
    // FIX: fileSink1 was built but never attached to any stream, so the
    // reflect-record demo (2.3) produced no output at all.
    stream.sinkTo(fileSink1);

    env.execute();
}