Apache Flink provides a distributed cache, similar to Hadoop's, through which files can be fetched by the parallel instances of user functions.
You register a file or directory, either on the local filesystem or on a remote system such as HDFS, under a name by calling registerCachedFile on the ExecutionEnvironment. When the program executes, Flink automatically copies the file or directory to the local filesystem of every worker process, and a user Function can then look up its contents via the registered name.
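For reference, registerCachedFile accepts local paths, remote URIs such as hdfs://, and an optional third argument that marks the file as executable on the workers. A minimal Scala sketch; the paths, HDFS address, and registered names below are placeholders, not part of the examples that follow:

val env = ExecutionEnvironment.getExecutionEnvironment
// local file, registered under the name "local-file"
env.registerCachedFile("file:///tmp/hello.txt", "local-file")
// file on HDFS; Flink copies it to every worker before the job starts
env.registerCachedFile("hdfs://namenode:9000/data/hello.txt", "hdfs-file")
// the optional third argument marks the cached file as executable
env.registerCachedFile("file:///tmp/script.sh", "exec-script", true)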
Scala
import org.apache.commons.io.FileUtils
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration
import org.apache.flink.core.fs.FileSystem.WriteMode

object DistributedCacheApp {

  def main(args: Array[String]): Unit = {
    val environment = ExecutionEnvironment.getExecutionEnvironment
    val filePath = "E:/test/hello.txt"

    // step 1: register a local file under the name "pk-scala-dc"
    environment.registerCachedFile(filePath, "pk-scala-dc")

    val data = environment.fromElements("hadoop", "spark", "flink", "pyspark")
    val info = data.map(new RichMapFunction[String, String] {
      // step 2: fetch the cached file from the distributed cache in open()
      override def open(parameters: Configuration): Unit = {
        val dcfile = getRuntimeContext.getDistributedCache.getFile("pk-scala-dc")
        val lines = FileUtils.readLines(dcfile, "UTF-8")
        import scala.collection.JavaConverters._
        for (ele <- lines.asScala) {
          println(ele)
        }
      }

      override def map(value: String): String = value
    })

    info.writeAsText("E:/test3", WriteMode.OVERWRITE).setParallelism(4)
    environment.execute("DistributedCacheApp")
  }
}
Java
import org.apache.commons.io.FileUtils;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.configuration.Configuration;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

public class JavaDistributedCachedApp {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment executionEnvironment = ExecutionEnvironment.getExecutionEnvironment();

        // step 1: register a local file under the name "pk-java-dc"
        executionEnvironment.registerCachedFile("E:/test/hello.txt", "pk-java-dc");

        DataSource<String> data1 = executionEnvironment.fromElements("hadoop", "spark", "flink", "pyspark");
        data1.map(new RichMapFunction<String, String>() {
            List<String> list = new ArrayList<>();

            // step 2: fetch the cached file from the distributed cache in open()
            @Override
            public void open(Configuration parameters) throws Exception {
                File file = getRuntimeContext().getDistributedCache().getFile("pk-java-dc");
                List<String> lines = FileUtils.readLines(file, "UTF-8");
                for (String line : lines) {
                    list.add(line);
                    // print each line read from the cached file
                    System.out.println(line);
                }
            }

            @Override
            public String map(String value) throws Exception {
                return value;
            }
        }).print(); // in the DataSet API, print() triggers job execution itself
    }
}
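Both examples only print the cached file, but in practice the file is usually read into a local data structure in open() and then consulted inside map(). A minimal Scala sketch of this pattern, using the same imports as the Scala example above; the name "keywords" and the tagging logic are illustrative assumptions, not part of the original examples:

val env = ExecutionEnvironment.getExecutionEnvironment
env.registerCachedFile("E:/test/hello.txt", "keywords")
val data = env.fromElements("hadoop", "spark", "flink", "pyspark")
val tagged = data.map(new RichMapFunction[String, String] {
  private var keywords: Set[String] = _

  override def open(parameters: Configuration): Unit = {
    // load the cached file once per parallel instance
    val dcFile = getRuntimeContext.getDistributedCache.getFile("keywords")
    import scala.collection.JavaConverters._
    keywords = FileUtils.readLines(dcFile, "UTF-8").asScala.toSet
  }

  // tag each element depending on whether it appears in the cached file
  override def map(value: String): String =
    if (keywords.contains(value)) value + " (in cache)" else value
})
tagged.print()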