Reading and Writing MongoDB with Spark SQL (Scala)
1.1 Add the dependencies
Add the following dependencies to the pom.xml:
<!-- Spark connector for MongoDB -->
<dependency>
    <groupId>org.mongodb.spark</groupId>
    <artifactId>mongo-spark-connector_2.11</artifactId>
    <version>2.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.3.1</version>
</dependency>
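If the project builds with sbt instead of Maven, the equivalent declarations would look like the following sketch (same versions and Scala 2.11; the 2.11.12 patch version is an assumption):

// build.sbt -- sbt equivalent of the Maven dependencies above
scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.mongodb.spark" %% "mongo-spark-connector" % "2.3.1",
  "org.apache.spark"  %% "spark-core"            % "2.3.1",
  "org.apache.spark"  %% "spark-sql"             % "2.3.1"
)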
1.2 Reading data from MongoDB
1.2.1 Write the code
package com.mongodb.spark

import org.apache.spark.sql.SparkSession

// Declaring the code in package com.mongodb.spark puts MongoSpark in scope
// without an explicit import.
object ReadMongo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local")
      .appName("MyApp")
      // Default input collection: database "test", collection "user"
      .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.user")
      .getOrCreate()
    // Set the log level
    spark.sparkContext.setLogLevel("WARN")

    // Load the collection as a DataFrame; the schema is inferred by sampling
    val df = MongoSpark.load(spark)
    df.show()

    // Register a temp view so the data can be queried with SQL
    df.createOrReplaceTempView("user")
    val resDf = spark.sql("select name, age, sex from user")
    resDf.show()

    spark.stop()
    System.exit(0)
  }
}
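To read a collection other than the one set in spark.mongodb.input.uri, the connector's ReadConfig can override options per load. A minimal sketch, reusing the spark session from the example above and assuming a hypothetical second collection named orders in the same test database:

import com.mongodb.spark.config.ReadConfig

// Override only the collection name; everything else falls back to the
// session configuration
val readConfig = ReadConfig(Map("collection" -> "orders"), Some(ReadConfig(spark)))
val ordersDf = MongoSpark.load(spark, readConfig)
ordersDf.show()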
1.3 Reading MongoDB data with a schema constraint
package com.mongodb.spark

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object ReadMongoSchema {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local")
      .appName("MyApp")
      .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.user")
      .getOrCreate()
    // Set the log level
    spark.sparkContext.setLogLevel("WARN")

    val schema = StructType(
      List(
        StructField("name", StringType),
        StructField("age", IntegerType),
        StructField("sex", StringType)
      )
    )

    // With an explicit schema the connector skips inference and returns
    // only the listed fields
    val df = spark.read.format("com.mongodb.spark.sql").schema(schema).load()
    df.show()

    df.createOrReplaceTempView("user")
    val resDf = spark.sql("select * from user")
    resDf.show()

    spark.stop()
    System.exit(0)
  }
}
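An alternative to hand-writing the StructType is to derive the schema from a case class and work with a typed Dataset. A sketch under the assumption that the user collection holds exactly these three fields (the User case class and ReadMongoTyped object are illustrative, not part of the original example):

import org.apache.spark.sql.{Encoders, SparkSession}

case class User(name: String, age: Int, sex: String)

object ReadMongoTyped {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local")
      .appName("MyApp")
      .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.user")
      .getOrCreate()
    import spark.implicits._

    // Derive the StructType from the case class instead of writing it by hand
    val userSchema = Encoders.product[User].schema
    val users = spark.read.format("com.mongodb.spark.sql").schema(userSchema).load().as[User]

    // Typed operations are now available
    users.filter(_.age > 20).show()

    spark.stop()
  }
}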
1.4 Writing data to MongoDB
package com.mongodb.spark

import org.apache.spark.sql.SparkSession
import org.bson.Document

object WriteMongo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local")
      .appName("MyApp")
      // Default output collection: database "test", collection "user"
      .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/test.user")
      .getOrCreate()
    // Set the log level
    spark.sparkContext.setLogLevel("WARN")

    // Build a few BSON documents to insert
    val document1 = new Document()
    document1.append("name", "sunshangxiang").append("age", 18).append("sex", "female")
    val document2 = new Document()
    document2.append("name", "diaochan").append("age", 24).append("sex", "female")
    val document3 = new Document()
    document3.append("name", "huangyueying").append("age", 23).append("sex", "female")

    // Parallelize the documents into an RDD and write it to MongoDB
    val rdd = spark.sparkContext.parallelize(Seq(document1, document2, document3))
    MongoSpark.save(rdd)

    spark.stop()
    System.exit(0)
  }
}
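If the data is already tabular, you can skip building org.bson.Document objects and write a DataFrame through the connector's data source instead. A sketch, assuming the same spark session with spark.mongodb.output.uri configured as above (the sample names are made up):

import spark.implicits._

// Each row becomes a document; mode "append" inserts the rows into the
// configured output collection
val peopleDf = Seq(
  ("zhaoyun", 28, "male"),
  ("ganfuren", 30, "female")
).toDF("name", "age", "sex")
peopleDf.write.format("com.mongodb.spark.sql").mode("append").save()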
Reference: Mongo on Spark Scala

Spark Large-Matrix Multiplication
Reference: https://www.cnblogs.com/fang-jie/articles/6138789.html
Input: the matrices are given in sparse (triplet) form. Each line reads "matrix row col value": the first field names the matrix (M or N), the second is the row index, the third the column index, and the fourth the value at that position.
M 1 1 1
M 1 3 5
M 2 2 7
M 3 1 6
M 3 3 9
M 4 1 2
M 4 2 10
N 1 1 1
N 1 3 3
N 1 5 5
N 2 2 6
N 2 3 9
N 2 4 8
N 2 5 7
N 3 2 10
N 3 4 12
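Read back into dense form (absent entries are zero), these triples describe a 4x3 matrix M and a 3x5 matrix N, so the product M x N is a 4x5 matrix:

M (4x3):
 1  0  5
 0  7  0
 6  0  9
 2 10  0

N (3x5):
 1  0  3  0  5
 0  6  9  8  7
 0 10  0 12  0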
Code:
package simrank

import org.apache.spark.sql.SparkSession

object simrankMaptest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("MatrixMultiply").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    test(spark)
    spark.stop()
  }

  def test(spark: SparkSession): Unit = {
    val path = "./sinrankmap.txt"
    val mats = spark.sparkContext.textFile(path)

    // Split the input into the two matrices by their label
    val firstMat = mats.filter(line => line.startsWith("M"))
    val secondMat = mats.filter(line => line.startsWith("N"))

    // For M, key each entry by its column index: (col, (label, row, value))
    val firstItems = firstMat.map(line => {
      val lineSplit = line.split(" ")
      (lineSplit(2), (lineSplit(0), lineSplit(1), lineSplit(3)))
    })

    // For N, key each entry by its row index: (row, (label, col, value))
    val secondItems = secondMat.map(line => {
      val lineSplit = line.split(" ")
      (lineSplit(1), (lineSplit(0), lineSplit(2), lineSplit(3)))
    })

    // Join on M.col == N.row; each matching pair yields one partial product,
    // keyed by the output cell "row col"
    val newItems = firstItems.join(secondItems).values.map(v => {
      (v._1._2 + " " + v._2._2, v._1._3.toFloat * v._2._3.toFloat)
    })

    // Sum the partial products for each output cell
    val result = newItems.reduceByKey((x, y) => x + y)
    result.collect().foreach(v => print(v + "\n"))
  }
}
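To trace one cell of the result: M(1,3)=5 and N(3,2)=10 meet in the join because M's column index 3 equals N's row index 3, contributing 5 x 10 = 50 to output cell (1 2). Cells with several matching pairs, e.g. (4 3) = 2 x 3 + 10 x 9 = 96, are summed by reduceByKey.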
Output:
(1 2,50.0)
(3 1,6.0)
(3 3,18.0)
(4 1,2.0)
(3 4,108.0)
(3 2,90.0)
(1 4,60.0)
(4 5,80.0)
(1 5,5.0)
(4 3,96.0)
(2 2,42.0)
(3 5,30.0)
(2 5,49.0)
(2 4,56.0)
(4 4,80.0)
(2 3,63.0)
(1 1,1.0)
(1 3,3.0)
(4 2,60.0)
How to transform a CompactBuffer to a List

You can simply convert a CompactBuffer to a List with the toList method.
val r_grp = rm.groupByKey
// After groupByKey:
(2,CompactBuffer(Bhupesh, Amit, Ratan, Dinesh, Sheela))
(1,CompactBuffer(Lokesh, Pavan, Tejas, Kumar, Venkat))
Use toList to convert each buffer to a List, and sortBy to sort it:
val com = r_grp.map(x => (x._1, x._2.toList.sortBy(x => x)))
com.foreach(println)
Output:
(1,List(Kumar, Lokesh, Pavan, Tejas, Venkat))
(2,List(Amit, Bhupesh, Dinesh, Ratan, Sheela))
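Since rm is not defined in the answer above, here is a self-contained sketch of the full flow, with hypothetical sample data reconstructed from the output shown:

import org.apache.spark.sql.SparkSession

object CompactBufferToList {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("CompactBufferToList").getOrCreate()
    // Hypothetical sample data matching the output above
    val rm = spark.sparkContext.parallelize(Seq(
      (2, "Bhupesh"), (2, "Amit"), (2, "Ratan"), (2, "Dinesh"), (2, "Sheela"),
      (1, "Lokesh"), (1, "Pavan"), (1, "Tejas"), (1, "Kumar"), (1, "Venkat")
    ))
    // groupByKey yields (key, Iterable[String]); at runtime the Iterable is
    // a CompactBuffer
    val r_grp = rm.groupByKey()
    // toList materializes each buffer as a List, which sortBy can then order
    val com = r_grp.map(x => (x._1, x._2.toList.sortBy(identity)))
    com.collect().foreach(println)
    spark.stop()
  }
}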