import org.apache.spark.api.java.JavaRDD; // Import the package/class that the method depends on
/**
 * Reads the RDF input at {@code path} into an RDD of quads.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Builds a Hadoop {@code Configuration} carrying the configured batch size
 *       (lines per map) and whether bad tuples are ignored (they are ignored unless
 *       error handling is {@code ParseErrorHandling.Throw}).</li>
 *   <li>Determines whether the input is line based: uses the configured value, or
 *       falls back to {@code guessIsLineBasedFormat(path)} when none is configured.</li>
 *   <li>Loads the input through {@code NQuadsInputFormat} (line based) or
 *       {@code TriplesOrQuadsInputFormat} (otherwise) and unwraps each
 *       {@code QuadWritable}.</li>
 *   <li>Repartitions the RDD when a partition count is configured.</li>
 *   <li>When accepted languages are configured, drops quads whose object is a literal
 *       with a non-empty language tag that is not in the accepted list.</li>
 * </ol>
 *
 * <p>NOTE(review): {@code JavaRDD} is used as a raw type here; the element type
 * parameter appears to be missing from this declaration - confirm against the
 * overridden method's signature.
 *
 * @param path location of the RDF input, passed to {@code newAPIHadoopFile}
 * @return RDD of the parsed (and optionally repartitioned and language-filtered) quads
 */
@Override
public JavaRDD parseQuads(String path) {
    Configuration conf = new Configuration();
    Integer batchSize = config.getBatchSize();
    conf.set(NLineInputFormat.LINES_PER_MAP, batchSize.toString());

    // Bad tuples are tolerated unless the configuration asks to throw on parse errors.
    boolean ignoreBadTuples = config.getErrorHandling() != ParseErrorHandling.Throw;
    conf.set(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, ignoreBadTuples ? "true" : "false");

    // An unset (null) option means "decide from the path".
    Boolean isLineBased = config.getLineBasedFormat();
    if (isLineBased == null) {
        isLineBased = guessIsLineBasedFormat(path);
    }

    JavaRDD quads;
    Integer partitions = config.getRepartition();
    if (isLineBased) {
        log.info("Parsing RDF in parallel with batch size: {}", batchSize);
        quads = sc.newAPIHadoopFile(path,
                NQuadsInputFormat.class,
                LongWritable.class, // position
                QuadWritable.class, // value
                conf).values().map(QuadWritable::get);
    } else {
        // let Jena guess the format, load whole files
        log.info("Input format is not line based, parsing RDF by Master node only.");
        quads = sc.newAPIHadoopFile(path,
                TriplesOrQuadsInputFormat.class,
                LongWritable.class, // position
                QuadWritable.class, // value
                conf).values().map(QuadWritable::get);
        if (partitions == null) {
            log.warn("Reading non-line based formats by master node only, consider setting --parsing.repartition to redistribute work to other nodes.");
        }
    }

    if (partitions != null) {
        log.info("Distributing workload, repartitioning into {} partitions", partitions);
        quads = quads.repartition(partitions);
    }

    // Only isEmpty/contains are needed, so a wildcard list avoids a raw type.
    final List<?> acceptedLanguages = config.getAcceptedLanguages();
    // A null or empty list means every language is accepted, so no filtering is applied.
    if (acceptedLanguages != null && !acceptedLanguages.isEmpty()) {
        // filter out literals of unsupported languages
        quads = quads.filter(quad -> {
            if (!quad.getObject().isLiteral()) {
                return true;
            }
            // Read the language tag once instead of once per condition.
            final String language = quad.getObject().getLiteralLanguage();
            return language == null
                    || language.isEmpty()
                    || acceptedLanguages.contains(language);
        });
    }
    return quads;
}