The job is launched by calling env.execute(). The environment (StreamExecutionEnvironment) has four concrete implementations; LocalStreamEnvironment, for example, simulates a Flink cluster with multiple threads inside the local JVM. All four implementations obtain the DAG object, a StreamGraph, through getStreamGraph().
The StreamGraph stores the vertex/edge relationships together with the context objects it needs, so it can fully reproduce the semantics of the API program. It holds a JobGraph object, which is what actually gets submitted to the cluster. The submission itself is carried out by Client, JobClient and JobClientMessages, which use Akka's Patterns to communicate with the cluster and start the job.
Every DataStream operation is ultimately routed through the transform() method, which produces a StreamTransformation and a StreamOperator. Inside transform(), the call getExecutionEnvironment().addOperator(resultTransform); feeds each result back into the environment's List<StreamTransformation<?>> transformations, so the list grows with every API call (see the sketch right after this paragraph).
Finally, StreamGraphGenerator.generate(this, transformations); lets the StreamGraphGenerator build the StreamGraph DAG from the environment and its transformations: it walks through the recorded transformations one by one and draws them into the StreamGraph.
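To make the transform() path above concrete, here is a minimal sketch, assuming a Flink 1.x flink-streaming-java dependency (the class name TransformSketch and the job name are made up for illustration). It calls transform() directly with a StreamMap operator, which is essentially what DataStream.map() does internally; each such call registers the resulting transformation with the environment via addOperator():

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.operators.StreamMap;

public class TransformSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStream<String> source = env.fromElements("flink", "stream");

        // Spelled out via transform(): the operator is wrapped in a transformation and
        // handed to the environment through addOperator(), growing env.transformations.
        DataStream<String> upper = source.transform(
                "UppercaseMap",                    // operator name shown in the DAG
                BasicTypeInfo.STRING_TYPE_INFO,    // produced type
                new StreamMap<>(new MapFunction<String, String>() {
                    @Override
                    public String map(String value) {
                        return value.toUpperCase();
                    }
                }));

        upper.print();
        env.execute("transform-sketch");           // builds StreamGraph -> JobGraph and submits
    }
}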
The StreamGraph captures the program's execution logic; the JobGraph it holds is the DAG that the tasks actually run against. createJobGraph() converts the logical plan into the job plan.
Conclusion: execute() translates the program logic into a StreamGraph object, converts that into a JobGraph, and submits it to the cluster, as the second sketch below illustrates.
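The whole pipeline can be observed end to end with the following sketch (again assuming Flink 1.x, matching the source quoted in this post; ExecuteFlowSketch is an illustrative class name). The fluent API only records transformations; getStreamGraph() builds the logical DAG from them, and getJobGraph() derives the plan that execute() would submit:

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.graph.StreamGraph;

public class ExecuteFlowSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Each API call goes through transform() and appends a StreamTransformation
        // to the environment's transformations list; nothing runs yet.
        env.fromElements("a", "b", "c")
            .map(new MapFunction<String, String>() {
                @Override
                public String map(String value) {
                    return value.toUpperCase();
                }
            })
            .print();

        // Step 1 of execute(): turn the recorded transformations into the logical DAG.
        StreamGraph streamGraph = env.getStreamGraph();
        System.out.println(streamGraph.getStreamingPlanAsJSON());

        // Step 2: derive the physical plan that would be shipped to the cluster.
        JobGraph jobGraph = streamGraph.getJobGraph();
        System.out.println("Job vertices: " + jobGraph.getNumberOfVertices());

        // In a real program you would simply call env.execute(), which does both steps and submits.
    }
}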
/**
* Creates an execution environment that represents the context in which the
* program is currently executed. If the program is invoked standalone, this
* method returns a local execution environment, as returned by
* {@link #createLocalEnvironment()}.
*
* @return The execution environment of the context in which the program is
* executed.
*/
public static StreamExecutionEnvironment getExecutionEnvironment() {
if (contextEnvironmentFactory != null) {
return contextEnvironmentFactory.createExecutionEnvironment();
}
// because the streaming project depends on "flink-clients" (and not the other way around)
// we currently need to intercept the data set environment and create a dependent stream env.
// this should be fixed once we rework the project dependencies
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
if (env instanceof ContextEnvironment) {
return new StreamContextEnvironment((ContextEnvironment) env);
} else if (env instanceof OptimizerPlanEnvironment || env instanceof PreviewPlanEnvironment) {
return new StreamPlanEnvironment(env);
} else {
return createLocalEnvironment();
}
}
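A quick way to see which branch of getExecutionEnvironment() you hit is the probe below (a hedged sketch; WhichEnvironment is just an illustrative class name). Run from the IDE it normally falls through to createLocalEnvironment(), while a job submitted with bin/flink run gets a context environment installed by the client beforehand:

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class WhichEnvironment {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // From the IDE this typically prints LocalStreamEnvironment; under `bin/flink run`
        // the client installs a context factory first and a StreamContextEnvironment is returned.
        System.out.println("Environment class: " + env.getClass().getName());
        System.out.println("Default parallelism: " + env.getParallelism());
    }
}

Below is the full source of the batch-side ExecutionEnvironment for reference.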
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.api.java;
import com.esotericsoftware.kryo.Serializer;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.Public;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.Plan;
import org.apache.flink.api.common.cache.DistributedCache.DistributedCacheEntry;
import org.apache.flink.api.common.io.FileInputFormat;
import org.apache.flink.api.common.io.InputFormat;
import org.apache.flink.api.common.operators.OperatorInformation;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.hadoop.mapred.HadoopInputFormat;
import org.apache.flink.api.java.io.CollectionInputFormat;
import org.apache.flink.api.java.io.CsvReader;
import org.apache.flink.api.java.io.IteratorInputFormat;
import org.apache.flink.api.java.io.ParallelIteratorInputFormat;
import org.apache.flink.api.java.io.PrimitiveInputFormat;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.api.java.io.TextValueInputFormat;
import org.apache.flink.api.java.operators.DataSink;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.Operator;
import org.apache.flink.api.java.operators.OperatorTranslation;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.PojoTypeInfo;
import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.api.java.typeutils.ValueTypeInfo;
import org.apache.flink.api.java.typeutils.runtime.kryo.Serializers;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.types.StringValue;
import org.apache.flink.util.NumberSequenceIterator;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.SplittableIterator;
import org.apache.flink.util.Visitor;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
/**
* The ExecutionEnvironment is the context in which a program is executed. A
* {@link LocalEnvironment} will cause execution in the current JVM, a
* {@link RemoteEnvironment} will cause execution on a remote setup.
* <p>
* The environment provides methods to control the job execution (such as setting the parallelism)
* and to interact with the outside world (data access).
* <p>
* Please note that the execution environment needs strong type information for the input and return types
* of all operations that are executed. This means that the environments needs to know that the return
* value of an operation is for example a Tuple of String and Integer.
* Because the Java compiler throws much of the generic type information away, most methods attempt to re-
* obtain that information using reflection. In certain cases, it may be necessary to manually supply that
* information to some of the methods.
*
* @see LocalEnvironment
* @see RemoteEnvironment
*/
@Public
public abstract class ExecutionEnvironment {
/** The logger used by the environment and its subclasses */
protected static final Logger LOG = LoggerFactory.getLogger(ExecutionEnvironment.class);
/** The environment of the context (local by default, cluster if invoked through command line) */
private static ExecutionEnvironmentFactory contextEnvironmentFactory;
/** The default parallelism used by local environments */
private static int defaultLocalDop = Runtime.getRuntime().availableProcessors();
// --------------------------------------------------------------------------------------------
private final List<DataSink<?>> sinks = new ArrayList<>();
private final List<Tuple2<String, DistributedCacheEntry>> cacheFile = new ArrayList<>();
private final ExecutionConfig config = new ExecutionConfig();
/** Result from the latest execution, to make it retrievable when using eager execution methods */
protected JobExecutionResult lastJobExecutionResult;
/** The ID of the session, defined by this execution environment. Sessions and Jobs are same in
* Flink, as Jobs can consist of multiple parts that are attached to the growing dataflow graph */
protected JobID jobID;
/** The session timeout in seconds */
protected long sessionTimeout;
/** Flag to indicate whether sinks have been cleared in previous executions */
private boolean wasExecuted = false;
/**
* Creates a new Execution Environment.
*/
protected ExecutionEnvironment() {
jobID = JobID.generate();
}
// --------------------------------------------------------------------------------------------
// Properties
// --------------------------------------------------------------------------------------------
/**
* Gets the config object that defines execution parameters.
*
* @return The environment's execution configuration.
*/
public ExecutionConfig getConfig() {
return config;
}
/**
* Gets the parallelism with which operation are executed by default. Operations can
* individually override this value to use a specific parallelism via
* {@link Operator#setParallelism(int)}. Other operations may need to run with a different
* parallelism - for example calling
* {@link DataSet#reduce(org.apache.flink.api.common.functions.ReduceFunction)} over the entire
* set will insert eventually an operation that runs non-parallel (parallelism of one).
*
* @return The parallelism used by operations, unless they override that value. This method
* returns {@link ExecutionConfig#PARALLELISM_DEFAULT}, if the environment's default parallelism should be used.
*/
public int getParallelism() {
return config.getParallelism();
}
/**
* Sets the parallelism for operations executed through this environment.
* Setting a parallelism of x here will cause all operators (such as join, map, reduce) to run with
* x parallel instances.
* <p>
* This method overrides the default parallelism for this environment.
* The {@link LocalEnvironment} uses by default a value equal to the number of hardware
* contexts (CPU cores / threads). When executing the program via the command line client
* from a JAR file, the default parallelism is the one configured for that setup.
*
* @param parallelism The parallelism
*/
public void setParallelism(int parallelism) {
config.setParallelism(parallelism);
}
/**
* Sets the restart strategy configuration. The configuration specifies which restart strategy
* will be used for the execution graph in case of a restart.
*
* @param restartStrategyConfiguration Restart strategy configuration to be set
*/
@PublicEvolving
public void setRestartStrategy(RestartStrategies.RestartStrategyConfiguration restartStrategyConfiguration) {
config.setRestartStrategy(restartStrategyConfiguration);
}
/**
* Returns the specified restart strategy configuration.
*
* @return The restart strategy configuration to be used
*/
@PublicEvolving
public RestartStrategies.RestartStrategyConfiguration getRestartStrategy() {
return config.getRestartStrategy();
}
/**
* Sets the number of times that failed tasks are re-executed. A value of zero
* effectively disables fault tolerance. A value of {@code -1} indicates that the system
* default value (as defined in the configuration) should be used.
*
* @param numberOfExecutionRetries The number of times the system will try to re-execute failed tasks.
*
* @deprecated This method will be replaced by {@link #setRestartStrategy}. The
* {@link RestartStrategies.FixedDelayRestartStrategyConfiguration} contains the number of
* execution retries.
*/
@Deprecated
@PublicEvolving
public void setNumberOfExecutionRetries(int numberOfExecutionRetries) {
config.setNumberOfExecutionRetries(numberOfExecutionRetries);
}
/**
* Gets the number of times the system will try to re-execute failed tasks. A value
* of {@code -1} indicates that the system default value (as defined in the configuration)
* should be used.
*
* @return The number of times the system will try to re-execute failed tasks.
*
* @deprecated This method will be replaced by {@link #getRestartStrategy}. The
* {@link RestartStrategies.FixedDelayRestartStrategyConfiguration} contains the number of
* execution retries.
*/
@Deprecated
@PublicEvolving
public int getNumberOfExecutionRetries() {
return config.getNumberOfExecutionRetries();
}
/**
* Returns the {@link org.apache.flink.api.common.JobExecutionResult} of the last executed job.
*
* @return The execution result from the latest job execution.
*/
public JobExecutionResult getLastJobExecutionResult(){
return this.lastJobExecutionResult;
}
// --------------------------------------------------------------------------------------------
// Session Management
// --------------------------------------------------------------------------------------------
/**
* Gets the JobID by which this environment is identified. The JobID sets the execution context
* in the cluster or local environment.
*
* @return The JobID of this environment.
* @see #getIdString()
*/
@PublicEvolving
public JobID getId() {
return this.jobID;
}
/**
* Gets the JobID by which this environment is identified, as a string.
*
* @return The JobID as a string.
* @see #getId()
*/
@PublicEvolving
public String getIdString() {
return this.jobID.toString();
}
/**
* Sets the session timeout to hold the intermediate results of a job. This only
* applies the updated timeout in future executions.
*
* @param timeout The timeout, in seconds.
*/
@PublicEvolving
public void setSessionTimeout(long timeout) {
throw new IllegalStateException("Support for sessions is currently disabled. " +
"It will be enabled in future Flink versions.");
// Session management is disabled, revert this commit to enable
//if (timeout < 0) {
// throw new IllegalArgumentException("The session timeout must not be less than zero.");
//}
//this.sessionTimeout = timeout;
}
/**
* Gets the session timeout for this environment. The session timeout defines for how long
* after an execution, the job and its intermediate results will be kept for future
* interactions.
*
* @return The session timeout, in seconds.
*/
@PublicEvolving
public long getSessionTimeout() {
return sessionTimeout;
}
/**
* Starts a new session, discarding the previous data flow and all of its intermediate results.
*/
@PublicEvolving
public abstract void startNewSession() throws Exception;
// --------------------------------------------------------------------------------------------
// Registry for types and serializers
// --------------------------------------------------------------------------------------------
/**
* Adds a new Kryo default serializer to the Runtime.
*
* Note that the serializer instance must be serializable (as defined by java.io.Serializable),
* because it may be distributed to the worker nodes by java serialization.
*
* @param type The class of the types serialized with the given serializer.
* @param serializer The serializer to use.
*/
public <T extends Serializer<?> & Serializable>void addDefaultKryoSerializer(Class<?> type, T serializer) {
config.addDefaultKryoSerializer(type, serializer);
}
/**
* Adds a new Kryo default serializer to the Runtime.
*
* @param type The class of the types serialized with the given serializer.
* @param serializerClass The class of the serializer to use.
*/
public void addDefaultKryoSerializer(Class<?> type, Class<? extends Serializer<?>> serializerClass) {
config.addDefaultKryoSerializer(type, serializerClass);
}
/**
* Registers the given type with a Kryo Serializer.
*
* Note that the serializer instance must be serializable (as defined by java.io.Serializable),
* because it may be distributed to the worker nodes by java serialization.
*
* @param type The class of the types serialized with the given serializer.
* @param serializer The serializer to use.
*/
public <T extends Serializer<?> & Serializable>void registerTypeWithKryoSerializer(Class<?> type, T serializer) {
config.registerTypeWithKryoSerializer(type, serializer);
}
/**
* Registers the given Serializer via its class as a serializer for the given type at the KryoSerializer
*
* @param type The class of the types serialized with the given serializer.
* @param serializerClass The class of the serializer to use.
*/
public void registerTypeWithKryoSerializer(Class<?> type, Class<? extends Serializer<?>> serializerClass) {
config.registerTypeWithKryoSerializer(type, serializerClass);
}
/**
* Registers the given type with the serialization stack. If the type is eventually
* serialized as a POJO, then the type is registered with the POJO serializer. If the
* type ends up being serialized with Kryo, then it will be registered at Kryo to make
* sure that only tags are written.
*
* @param type The class of the type to register.
*/
public void registerType(Class<?> type) {
if (type == null) {
throw new NullPointerException("Cannot register null type class.");
}
TypeInformation<?> typeInfo = TypeExtractor.createTypeInfo(type);
if (typeInfo instanceof PojoTypeInfo) {
config.registerPojoType(type);
} else {
config.registerKryoType(type);
}
}
// --------------------------------------------------------------------------------------------
// Data set creations
// --------------------------------------------------------------------------------------------
// ---------------------------------- Text Input Format ---------------------------------------
/**
* Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise.
* The file will be read with the system's default character set.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
* @return A {@link DataSet} that represents the data read from the given file as text lines.
*/
public DataSource<String> readTextFile(String filePath) {
Preconditions.checkNotNull(filePath, "The file path may not be null.");
return new DataSource<>(this, new TextInputFormat(new Path(filePath)), BasicTypeInfo.STRING_TYPE_INFO, Utils.getCallLocationName());
}
/**
* Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise.
* The {@link java.nio.charset.Charset} with the given name will be used to read the files.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
* @param charsetName The name of the character set used to read the file.
* @return A {@link DataSet} that represents the data read from the given file as text lines.
*/
public DataSource<String> readTextFile(String filePath, String charsetName) {
Preconditions.checkNotNull(filePath, "The file path may not be null.");
TextInputFormat format = new TextInputFormat(new Path(filePath));
format.setCharsetName(charsetName);
return new DataSource<>(this, format, BasicTypeInfo.STRING_TYPE_INFO, Utils.getCallLocationName());
}
// -------------------------- Text Input Format With String Value------------------------------
/**
* Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise.
* This method is similar to {@link #readTextFile(String)}, but it produces a DataSet with mutable
* {@link StringValue} objects, rather than Java Strings. StringValues can be used to tune implementations
* to be less object and garbage collection heavy.
* <p>
* The file will be read with the system's default character set.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
* @return A {@link DataSet} that represents the data read from the given file as text lines.
*/
public DataSource<StringValue> readTextFileWithValue(String filePath) {
Preconditions.checkNotNull(filePath, "The file path may not be null.");
return new DataSource<>(this, new TextValueInputFormat(new Path(filePath)), new ValueTypeInfo<>(StringValue.class), Utils.getCallLocationName());
}
/**
* Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise.
* This method is similar to {@link #readTextFile(String, String)}, but it produces a DataSet with mutable
* {@link StringValue} objects, rather than Java Strings. StringValues can be used to tune implementations
* to be less object and garbage collection heavy.
* <p>
* The {@link java.nio.charset.Charset} with the given name will be used to read the files.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
* @param charsetName The name of the character set used to read the file.
* @param skipInvalidLines A flag to indicate whether to skip lines that cannot be read with the given character set.
*
* @return A DataSet that represents the data read from the given file as text lines.
*/
public DataSource<StringValue> readTextFileWithValue(String filePath, String charsetName, boolean skipInvalidLines) {
Preconditions.checkNotNull(filePath, "The file path may not be null.");
TextValueInputFormat format = new TextValueInputFormat(new Path(filePath));
format.setCharsetName(charsetName);
format.setSkipInvalidLines(skipInvalidLines);
return new DataSource<>(this, format, new ValueTypeInfo<>(StringValue.class), Utils.getCallLocationName());
}
// ----------------------------------- Primitive Input Format ---------------------------------------
/**
* Creates a {@link DataSet} that represents the primitive type produced by reading the given file line wise.
* This method is similar to {@link #readCsvFile(String)} with single field, but it produces a DataSet not through
* {@link org.apache.flink.api.java.tuple.Tuple1}.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
* @param typeClass The primitive type class to be read.
* @return A {@link DataSet} that represents the data read from the given file as primitive type.
*/
public <X> DataSource<X> readFileOfPrimitives(String filePath, Class<X> typeClass) {
Preconditions.checkNotNull(filePath, "The file path may not be null.");
return new DataSource<>(this, new PrimitiveInputFormat<>(new Path(filePath), typeClass), TypeExtractor.getForClass(typeClass), Utils.getCallLocationName());
}
/**
* Creates a {@link DataSet} that represents the primitive type produced by reading the given file in delimited way.
* This method is similar to {@link #readCsvFile(String)} with single field, but it produces a DataSet not through
* {@link org.apache.flink.api.java.tuple.Tuple1}.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
* @param delimiter The delimiter of the given file.
* @param typeClass The primitive type class to be read.
* @return A {@link DataSet} that represents the data read from the given file as primitive type.
*/
public <X> DataSource<X> readFileOfPrimitives(String filePath, String delimiter, Class<X> typeClass) {
Preconditions.checkNotNull(filePath, "The file path may not be null.");
return new DataSource<>(this, new PrimitiveInputFormat<>(new Path(filePath), delimiter, typeClass), TypeExtractor.getForClass(typeClass), Utils.getCallLocationName());
}
// ----------------------------------- CSV Input Format ---------------------------------------
/**
* Creates a CSV reader to read a comma separated value (CSV) file. The reader has options to
* define parameters and field types and will eventually produce the DataSet that corresponds to
* the read and parsed CSV input.
*
* @param filePath The path of the CSV file.
* @return A CsvReader that can be used to configure the CSV input.
*/
public CsvReader readCsvFile(String filePath) {
return new CsvReader(filePath, this);
}
// ------------------------------------ File Input Format -----------------------------------------
public <X> DataSource<X> readFile(FileInputFormat<X> inputFormat, String filePath) {
if (inputFormat == null) {
throw new IllegalArgumentException("InputFormat must not be null.");
}
if (filePath == null) {
throw new IllegalArgumentException("The file path must not be null.");
}
inputFormat.setFilePath(new Path(filePath));
try {
return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat));
}
catch (Exception e) {
throw new InvalidProgramException("The type returned by the input format could not be automatically determined. " +
"Please specify the TypeInformation of the produced type explicitly by using the " +
"'createInput(InputFormat, TypeInformation)' method instead.");
}
}
// ----------------------------------- Generic Input Format ---------------------------------------
/**
* Generic method to create an input {@link DataSet} with in {@link InputFormat}. The DataSet will not be
* immediately created - instead, this method returns a DataSet that will be lazily created from
* the input format once the program is executed.
* <p>
* Since all data sets need specific information about their types, this method needs to determine
* the type of the data produced by the input format. It will attempt to determine the data type
* by reflection, unless the input format implements the {@link ResultTypeQueryable} interface.
* In the latter case, this method will invoke the {@link ResultTypeQueryable#getProducedType()}
* method to determine data type produced by the input format.
*
* @param inputFormat The input format used to create the data set.
* @return A {@link DataSet} that represents the data created by the input format.
*
* @see #createInput(InputFormat, TypeInformation)
*/
public <X> DataSource<X> createInput(InputFormat<X, ?> inputFormat) {
if (inputFormat == null) {
throw new IllegalArgumentException("InputFormat must not be null.");
}
try {
return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat));
}
catch (Exception e) {
throw new InvalidProgramException("The type returned by the input format could not be automatically determined. " +
"Please specify the TypeInformation of the produced type explicitly by using the " +
"'createInput(InputFormat, TypeInformation)' method instead.", e);
}
}
/**
* Generic method to create an input DataSet with in {@link InputFormat}. The {@link DataSet} will not be
* immediately created - instead, this method returns a {@link DataSet} that will be lazily created from
* the input format once the program is executed.
* <p>
* The {@link DataSet} is typed to the given TypeInformation. This method is intended for input formats that
* where the return type cannot be determined by reflection analysis, and that do not implement the
* {@link ResultTypeQueryable} interface.
*
* @param inputFormat The input format used to create the data set.
* @return A {@link DataSet} that represents the data created by the input format.
*
* @see #createInput(InputFormat)
*/
public <X> DataSource<X> createInput(InputFormat<X, ?> inputFormat, TypeInformation<X> producedType) {
if (inputFormat == null) {
throw new IllegalArgumentException("InputFormat must not be null.");
}
if (producedType == null) {
throw new IllegalArgumentException("Produced type information must not be null.");
}
return new DataSource<>(this, inputFormat, producedType, Utils.getCallLocationName());
}
// ----------------------------------- Hadoop Input Format ---------------------------------------
/**
* Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapred.FileInputFormat}. The
* given inputName is set on the given job.
*/
@PublicEvolving
public <K,V> DataSource<Tuple2<K, V>> readHadoopFile(org.apache.hadoop.mapred.FileInputFormat<K,V> mapredInputFormat, Class<K> key, Class<V> value, String inputPath, JobConf job) {
DataSource<Tuple2<K, V>> result = createHadoopInput(mapredInputFormat, key, value, job);
org.apache.hadoop.mapred.FileInputFormat.addInputPath(job, new org.apache.hadoop.fs.Path(inputPath));
return result;
}
/**
* Creates a {@link DataSet} from {@link org.apache.hadoop.mapred.SequenceFileInputFormat}
* A {@link org.apache.hadoop.mapred.JobConf} with the given inputPath is created.
*/
@PublicEvolving
public <K,V> DataSource<Tuple2<K, V>> readSequenceFile(Class<K> key, Class<V> value, String inputPath) throws IOException {
return readHadoopFile(new org.apache.hadoop.mapred.SequenceFileInputFormat<K, V>(), key, value, inputPath);
}
/**
* Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapred.FileInputFormat}. A
* {@link org.apache.hadoop.mapred.JobConf} with the given inputPath is created.
*/
@PublicEvolving
public <K,V> DataSource<Tuple2<K, V>> readHadoopFile(org.apache.hadoop.mapred.FileInputFormat<K,V> mapredInputFormat, Class<K> key, Class<V> value, String inputPath) {
return readHadoopFile(mapredInputFormat, key, value, inputPath, new JobConf());
}
/**
* Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapred.InputFormat}.
*/
@PublicEvolving
public <K,V> DataSource<Tuple2<K, V>> createHadoopInput(org.apache.hadoop.mapred.InputFormat<K,V> mapredInputFormat, Class<K> key, Class<V> value, JobConf job) {
HadoopInputFormat<K, V> hadoopInputFormat = new HadoopInputFormat<>(mapredInputFormat, key, value, job);
return this.createInput(hadoopInputFormat);
}
/**
* Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}. The
* given inputName is set on the given job.
*/
@PublicEvolving
public <K,V> DataSource<Tuple2<K, V>> readHadoopFile(org.apache.hadoop.mapreduce.lib.input.FileInputFormat<K,V> mapreduceInputFormat, Class<K> key, Class<V> value, String inputPath, Job job) throws IOException {
DataSource<Tuple2<K, V>> result = createHadoopInput(mapreduceInputFormat, key, value, job);
org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new org.apache.hadoop.fs.Path(inputPath));
return result;
}
/**
* Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}. A
* {@link org.apache.hadoop.mapreduce.Job} with the given inputPath is created.
*/
@PublicEvolving
pub