1. Reading from MongoDB
package com.mongodb.spark;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.bson.Document;

import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.rdd.api.java.JavaMongoRDD;

public final class ReadFromMongoDB {
    public static void main(final String[] args) {
        SparkSession spark = SparkSession.builder().master("local").appName("MongoSparkConnectorIntro")
                .config("spark.mongodb.input.uri", "mongodb://172.28.34.xxx:27117/wangzs.zhaopin")
                .config("spark.mongodb.output.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparkmongo")
                .getOrCreate();

        // Create a JavaSparkContext using the SparkSession's SparkContext object
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        /* Start Example: Read data from MongoDB ************************/
        JavaMongoRDD<Document> rdd = MongoSpark.load(jsc);
        /* End Example **************************************************/

        // Analyze data from MongoDB
        System.out.println(rdd.count());
        System.out.println(rdd.first().toJson());

        jsc.close();
    }
}
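If the collection to read is not the one named in spark.mongodb.input.uri, the connector's ReadConfig can override it per load. A minimal sketch, assuming the same connector 2.1 API as above; the overridden read preference is illustrative:

// Sketch: override input options for a single load instead of changing
// spark.mongodb.input.uri. The readPreference value here is illustrative.
Map<String, String> readOverrides = new HashMap<String, String>();
readOverrides.put("collection", "zhaopin");
readOverrides.put("readPreference.name", "secondaryPreferred");
ReadConfig readConfig = ReadConfig.create(jsc).withOptions(readOverrides);
JavaMongoRDD<Document> customRdd = MongoSpark.load(jsc, readConfig);
// Requires: java.util.HashMap, java.util.Map, com.mongodb.spark.config.ReadConfig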
2. Writing to MongoDB
package com.mongodb.spark;

import static java.util.Arrays.asList;

import java.util.HashMap;
import java.util.Map;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.SparkSession;
import org.bson.Document;

import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.config.WriteConfig;

public final class WriteToMongoDBWriteConfig {
    public static void main(final String[] args) {
        SparkSession spark = SparkSession.builder().master("local").appName("MongoSparkConnectorIntro")
                .config("spark.mongodb.input.uri", "mongodb://172.28.34.xxx:27117/wangzs.zhaopin")
                .config("spark.mongodb.output.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparkmongo")
                .getOrCreate();

        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        // Create a custom WriteConfig: target the "spark" collection with w:majority
        Map<String, String> writeOverrides = new HashMap<String, String>();
        writeOverrides.put("collection", "spark");
        writeOverrides.put("writeConcern.w", "majority");
        WriteConfig writeConfig = WriteConfig.create(jsc).withOptions(writeOverrides);

        // Create an RDD of 10 documents
        JavaRDD<Document> sparkDocuments = jsc.parallelize(asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
                .map(new Function<Integer, Document>() {
                    public Document call(final Integer i) throws Exception {
                        return Document.parse("{spark: " + i + ", name: " + i + "}");
                    }
                });

        /* Start Example: Save data from RDD to MongoDB *****************/
        MongoSpark.save(sparkDocuments, writeConfig);
        /* End Example **************************************************/

        jsc.close();
    }
}
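For comparison, when no WriteConfig is passed, save() falls back to spark.mongodb.output.uri, so the same RDD would land in wangzs.sparkmongo; a one-line sketch:

// Sketch: default write config, taken from spark.mongodb.output.uri
// (here the wangzs.sparkmongo collection), with the default write concern.
MongoSpark.save(sparkDocuments);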
3. Aggregation
package com.mongodb.spark;

import static java.util.Collections.singletonList;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.bson.Document;

import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.rdd.api.java.JavaMongoRDD;

public final class Aggregation {
    public static void main(final String[] args) {
        SparkSession spark = SparkSession.builder().master("local").appName("Aggregation")
                .config("spark.mongodb.input.uri", "mongodb://172.28.34.xxx:27117/wangzs.zhaopin")
                .config("spark.mongodb.output.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparkmongo")
                .getOrCreate();

        // Create a JavaSparkContext using the SparkSession's SparkContext object
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        // Load and analyze data from MongoDB
        JavaMongoRDD<Document> rdd = MongoSpark.load(jsc);

        /* Start Example: Use aggregation to filter an RDD **************/
        JavaMongoRDD<Document> aggregatedRdd = rdd
                .withPipeline(singletonList(Document.parse("{ $match: { 'gzdd' : '上海-普陀区' } }")));
        /* End Example **************************************************/

        // Analyze data from MongoDB
        System.out.println(aggregatedRdd.count());
        System.out.println(aggregatedRdd.collect());

        jsc.close();
    }
}
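Because withPipeline hands the stages to MongoDB, the filtering runs server-side and only matching documents are shipped to Spark. Additional stages chain the same way; a sketch that adds an illustrative $project after the $match (the projected field list is an assumption about the zhaopin documents):

// Sketch: a two-stage pipeline evaluated by MongoDB before data reaches Spark.
// The $project field list is illustrative.
JavaMongoRDD<Document> projectedRdd = rdd.withPipeline(asList(
        Document.parse("{ $match: { 'gzdd' : '上海-普陀区' } }"),
        Document.parse("{ $project: { 'gzdd' : 1, '_id' : 0 } }")));
System.out.println(projectedRdd.first().toJson());
// Requires: import static java.util.Arrays.asList;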
4. Datasets and SQL
Sample documents in the wangzs.sparktest collection:
/* 1 */
{
"_id" : ObjectId("5ae911d3460fcf70c940ac96"),
"name" : "Bilbo Baggins",
"age" : 50.0
}
/* 2 */
{
"_id" : ObjectId("5ae911d3460fcf70c940ac97"),
"name" : "Gandalf",
"age" : 1000.0
}
/* 3 */
{
"_id" : ObjectId("5ae911d3460fcf70c940ac98"),
"name" : "Thorin",
"age" : 195.0
}
/* 4 */
{
"_id" : ObjectId("5ae911d3460fcf70c940ac99"),
"name" : "Balin",
"age" : 178.0
}
/* 5 */
{
"_id" : ObjectId("5ae911d3460fcf70c940ac9a"),
"name" : "Kíli",
"age" : 77.0
}
/* 6 */
{
"_id" : ObjectId("5ae911d3460fcf70c940ac9b"),
"name" : "Dwalin",
"age" : 169.0
}
/* 7 */
{
"_id" : ObjectId("5ae911d3460fcf70c940ac9c"),
"name" : "Óin",
"age" : 167.0
}
/* 8 */
{
"_id" : ObjectId("5ae911d3460fcf70c940ac9d"),
"name" : "Glóin",
"age" : 158.0
}
/* 9 */
{
"_id" : ObjectId("5ae911d3460fcf70c940ac9e"),
"name" : "Fíli",
"age" : 82.0
}
/* 10 */
{
"_id" : ObjectId("5ae911d3460fcf70c940ac9f"),
"name" : "Bombur"
}
The Character JavaBean supplies the explicit schema for the Dataset; age is an Integer so the age-less Bombur document maps to null.
package com.mongodb.spark;

import java.io.Serializable;

public final class Character implements Serializable {
    private String name;
    private Integer age;

    public String getName() {
        return name;
    }

    public void setName(final String name) {
        this.name = name;
    }

    public Integer getAge() {
        return age;
    }

    public void setAge(final Integer age) {
        this.age = age;
    }
}
package com.mongodb.spark;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import com.mongodb.spark.MongoSpark;

public final class DatasetSQLDemo {
    public static void main(final String[] args) {
        SparkSession spark = SparkSession.builder().master("local").appName("DatasetSQLDemo")
                .config("spark.mongodb.input.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparktest")
                .config("spark.mongodb.output.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparkmongo")
                .getOrCreate();

        // Create a JavaSparkContext using the SparkSession's SparkContext object
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        // Load data with an explicit schema derived from the Character bean
        Dataset<Character> explicitDS = MongoSpark.load(jsc).toDS(Character.class);
        explicitDS.printSchema();
        explicitDS.show();

        // Create the temp view and execute the query
        explicitDS.createOrReplaceTempView("characters");
        Dataset<Row> centenarians = spark.sql("SELECT name, age FROM characters WHERE age >= 100");
        centenarians.show();

        // Write the result to the "hundredClub" collection
        MongoSpark.write(centenarians).option("collection", "hundredClub").mode("overwrite").save();

        jsc.close();
    }
}
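To check the result, the hundredClub collection can be loaded back with a per-read collection override, mirroring the WriteConfig pattern from section 2; a minimal sketch assuming the same connector API:

// Sketch: read the "hundredClub" collection written above back into a
// DataFrame, using the Character bean to derive the schema.
Map<String, String> readOverrides = new HashMap<String, String>();
readOverrides.put("collection", "hundredClub");
ReadConfig readConfig = ReadConfig.create(jsc).withOptions(readOverrides);
MongoSpark.load(jsc, readConfig).toDF(Character.class).show();
// Requires: java.util.HashMap, java.util.Map, com.mongodb.spark.config.ReadConfig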
5. pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.wangzs</groupId>
    <artifactId>spark-2.1.0-learn</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>spark-2.1.0-learn</name>
    <url>http://maven.apache.org</url>
    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency> <!-- Spark dependency -->
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb.spark</groupId>
            <artifactId>mongo-spark-connector_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
    </dependencies>
    <build>
        <pluginManagement>
            <plugins>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <configuration>
                        <source>1.8</source>
                        <target>1.8</target>
                    </configuration>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-resources-plugin</artifactId>
                    <configuration>
                        <encoding>UTF-8</encoding>
                    </configuration>
                </plugin>
                <!-- Skip tests when packaging -->
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <configuration>
                        <skipTests>true</skipTests>
                    </configuration>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>