Flink 自定义 Source 读取阿里云 OSS 中的文件
一、POM依赖引入
<!-- Version properties referenced by the dependencies below. -->
<properties>
<flink.version>1.9.1</flink.version>
<aliyun.oss.version>2.8.3</aliyun.oss.version>
</properties>
<dependencies>
<!-- Aliyun OSS SDK; its version comes from the aliyun.oss.version property. -->
<dependency>
<groupId>com.aliyun.oss</groupId>
<artifactId>aliyun-sdk-oss</artifactId>
<version>${aliyun.oss.version}</version>
</dependency>
<!-- flink -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- NOTE(review): the two Kafka connectors below are not referenced by the OSS source
     snippets in this article; confirm whether the job needs them. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka-0.10_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- snappy-java; presumably the provider of the SnappyInputStream used to decompress the OSS files. -->
<dependency>
<groupId>org.xerial.snappy</groupId>
<artifactId>snappy-java</artifactId>
<version>1.1.7.2</version>
<type>jar</type>
<scope>compile</scope>
</dependency>
</dependencies>
二、自定义OSS Source
1.继承 RichParallelSourceFunction
继承 RichParallelSourceFunction 的 Source 可以通过 setParallelism 设置大于 1 的并行度;而只实现普通 SourceFunction 的 Source 并行度固定为 1,无法修改。注意:并行度大于 1 时,每个并行实例都会各自执行一次 run 方法。
代码示例:
/**
 * Skeleton of the custom OSS source: a parallel Flink source that emits {@code String} records.
 * The method bodies are shown in the following sections.
 */
public class OssSource extends RichParallelSourceFunction<String> {
}
2.实现方法
open方法示例:
/**
 * Loads the OSS connection settings and creates the OSS client.
 *
 * <p>The settings ({@code endpoint}, {@code accessKeyId}, {@code accessKeySecret},
 * {@code bucketName}) are read from {@code config.properties}; the client is created
 * from the first three, and {@code super.open} is invoked last.
 *
 * @param parameters configuration passed through to {@code super.open}
 * @throws Exception if loading the settings, creating the client or {@code super.open} fails
 */
@Override
public void open(Configuration parameters) throws Exception {
    // For testing: read the settings from the local properties file.
    final Properties localConfig = PropertiesUtils.loadProp("config.properties");
    bucketName = localConfig.getProperty("bucketName");
    accessKeySecret = localConfig.getProperty("accessKeySecret");
    accessKeyId = localConfig.getProperty("accessKeyId");
    endpoint = localConfig.getProperty("endpoint");

    // Alternative: read the settings from the configuration file provided by the system.
    //
    //   ParameterTool prop = PropertiesUtils.createParameterTool();
    //   endpoint = prop.get("endpoint");
    //   accessKeyId = prop.get("accessKeyId");
    //   accessKeySecret = prop.get("accessKeySecret");
    //   bucketName = prop.get("bucketName");

    ossClient = new OSSClient(endpoint, accessKeyId, accessKeySecret);

    super.open(parameters);
}
run方法示例:
/**
 * Lists the objects under {@code "sy-online-fdr-v2/" + ptt_day} page by page and emits,
 * line by line, the content of every object whose file name starts with the configured hour.
 *
 * <p>The hour is taken from the fifth path segment of the object key (the text before the
 * first {@code '_'}), e.g. {@code sy-online-fdr-v2/2019/11/28/<hour>_...}. Keys with fewer than
 * five segments (such as directory placeholder objects) are skipped instead of failing with an
 * {@link ArrayIndexOutOfBoundsException}.
 *
 * <p>Every parallel instance of this source executes this method. To avoid emitting each
 * object once per instance, a matching object is read only by the subtask selected by the hash
 * of its key. With a parallelism of 1 all objects are read by the single subtask.
 *
 * @param ctx the context used to emit the decompressed lines
 * @throws Exception if listing or reading an object fails
 */
@Override
public void run(SourceContext<String> ctx) throws Exception {
    final int subtaskCount = getRuntimeContext().getNumberOfParallelSubtasks();
    final int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();
    // Example listing prefix: "sy-online-fdr-v2/2019/11/28"
    final String listingPrefix = "sy-online-fdr-v2/" + ptt_day;

    String nextMarker = null;
    ObjectListing objectListing;
    do {
        objectListing = ossClient.listObjects(
                new ListObjectsRequest(bucketName).withPrefix(listingPrefix).withMarker(nextMarker));

        for (OSSObjectSummary objectSummary : objectListing.getObjectSummaries()) {
            String key = objectSummary.getKey();

            String[] segments = key.split("/");
            if (segments.length < 5) {
                // Not a data file below the day directory (e.g. a directory placeholder).
                continue;
            }

            // Text before the first '_' of the file name; the whole name when there is no '_'.
            String fileName = segments[4];
            int underscore = fileName.indexOf('_');
            String keyPrefix = underscore < 0 ? fileName : fileName.substring(0, underscore);
            if (!hour.equals(keyPrefix)) {
                continue;
            }

            // String.hashCode is stable across JVMs, so each key is owned by exactly one subtask.
            if (Math.floorMod(key.hashCode(), subtaskCount) != subtaskIndex) {
                continue;
            }

            OSSObject object = ossClient.getObject(bucketName, key);
            InputStream objectContent = object.getObjectContent();
            readSnappy(objectContent, ctx);
        }

        // Continue the next listing page from where this one ended.
        nextMarker = objectListing.getNextMarker();
    } while (objectListing.isTruncated());
}
cancel 和 close方法示例:
/**
 * Cancels the source by shutting down the OSS client.
 *
 * <p>The client is only assigned at the end of {@code open}, so it is still {@code null} when
 * {@code open} has not completed; in that case there is nothing to shut down.
 */
@Override
public void cancel() {
    if (ossClient != null) {
        ossClient.shutdown();
    }
}
/**
 * Releases the OSS client when the source is closed.
 *
 * <p>The client is only assigned at the end of {@code open}, so it is still {@code null} when
 * {@code open} failed earlier; the guard keeps that original failure from being masked by a
 * {@link NullPointerException} thrown here.
 *
 * @throws Exception declared by the overridden method
 */
@Override
public void close() throws Exception {
    if (ossClient != null) {
        ossClient.shutdown();
    }
}
3.流式读取Snappy文件
/**
 * Decompresses a Snappy-compressed stream and emits its content line by line.
 *
 * <p>The input stream is always closed before this method returns, including when the
 * decompressor cannot be created. Read failures are propagated to the caller instead of being
 * printed and ignored, so a broken object is not silently dropped. The text is decoded as
 * UTF-8 rather than with the platform default charset.
 *
 * @param input the compressed stream; closed by this method
 * @param ctx   the context that receives each decompressed line
 * @throws IOException if the stream cannot be decompressed or read
 */
public static void readSnappy(InputStream input, SourceContext<String> ctx) throws IOException {
    // Closing the raw stream is sufficient: the wrappers created below hold no other resource.
    try (InputStream raw = input) {
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new SnappyInputStream(raw), java.nio.charset.StandardCharsets.UTF_8));
        String line;
        while ((line = reader.readLine()) != null) {
            ctx.collect(line);
        }
    }
}
读取snappy可以参考 https://github.com/xerial/snappy-java
4.使用自定义Source
// Obtain the execution environment for the streaming job.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Register the custom OSS source and run it with a parallelism of 2.
DataStreamSource<String> inputStream = env.addSource(new OssSource(para,hour)).setParallelism(2);