首先要引入 mongodb-spark-connector 的 Maven 依赖,具体可参见官方 Java API 文档: https://docs.mongodb.com/spark-connector/current/java-api/ 。之后基本上就可以按照该文档的内容进行 Spark 操作了,文档中已经包含了用 Spark 读取 MongoDB 数据并转化为 RDD 的操作。
下面放一些补充的、或许有用的代码。
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.client.MongoDatabase;
import com.mongodb.spark.MongoConnector;
import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.config.ReadConfig;
import com.mongodb.spark.config.WriteConfig;
import com.mongodb.spark.rdd.api.java.JavaMongoRDD;
import com.mongodb.spark.sql.helpers.StructFields;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.bson.Document;
import org.bson.types.ObjectId;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.lang.String.format;
import static java.util.Arrays.asList;
import static java.util.Collections.singletonList;
public final class JavaIntroduction {
/**
* Run this main method to see the output of this quick example.
*
* @param args takes an optional single argument for the connection string
* @throws InterruptedException if a latch is interrupted
*/
public s