Problem Summary
The Scenario
Spark on its own is weak at incremental updates to a large dataset. For example: an initial load of 40 million records (roughly 10 GB) receives a daily increment of about 100 MB, split between newly added records and changes to existing ones. How do we apply those 100 MB of changes to the database?
The Old Approach
- Load the 100 MB daily increment
- Load the full 10 GB already in the store
- Use a left join to work out which increment rows are new and which are changes
- Write the resulting delta back to the store (sketched below)
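A minimal sketch of that flow, assuming a Parquet-backed increment, a MongoDB-backed base table, and a business key column named key (all three are illustrative, not taken from the original):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit}

val spark = SparkSession.builder().appName("delta-by-full-join").getOrCreate()

// the ~100 MB daily increment (new rows plus changed rows)
val increment = spark.read.parquet("/data/increment/dt=today")

// the full ~10 GB base table has to be re-read from the store every single day
val full = spark.read.format("com.mongodb.spark.sql.DefaultSource")
  .option("uri", "mongodb://host:27017/db.coll")
  .load()

// left join on the business key: unmatched rows are brand new, matched rows are changes
val tagged  = increment.join(full.select(col("key"), lit(true).as("exists")), Seq("key"), "left")
val inserts = tagged.filter(col("exists").isNull).drop("exists")
val updates = tagged.filter(col("exists").isNotNull).drop("exists")
// both halves are then written back; the cost is dominated by scanning `full` above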
In short: even as a big-data team we had no better answer to this problem than to trade performance for the business increment, re-reading the full dataset every day just to merge a small delta.
The Solution
- Key point 1: the daily increment can be captured precisely
- Key point 2: MongoDB's `_id` semantics
- Key point 3: the behavior of Spark's DataFrame.write in Append mode (combined in the sketch below)
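Putting the three points together, the new flow only ever touches the 100 MB increment: project the unique business key into `_id` and write in Append mode, letting the connector upsert. A minimal sketch (the URI and the orderId key column are assumptions for illustration):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

val spark = SparkSession.builder().appName("incremental-upsert").getOrCreate()

// only the ~100 MB daily increment is read; the 10 GB base data is never touched
val increment = spark.read.parquet("/data/increment/dt=today")

// with an `_id` column present, Append becomes a per-document upsert
// (see the connector source analysed below)
increment
  .withColumn("_id", col("orderId"))
  .write
  .format("com.mongodb.spark.sql.DefaultSource")
  .option("uri", "mongodb://host:27017/db.coll")
  .mode("append")
  .save()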
Analysis of the Key Excerpts
The two excerpts below are from the MongoDB Spark Connector source. In createRelation, the Append branch simply delegates to MongoSpark.save, which is where the `_id` handling lives.
override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
  val writeConfig = WriteConfig(sqlContext.sparkContext.getConf, parameters)
  val mongoConnector = MongoConnector(writeConfig.asOptions)
  lazy val collectionExists: Boolean = mongoConnector.withDatabaseDo(
    writeConfig, { db => db.listCollectionNames().asScala.toList.contains(writeConfig.collectionName) }
  )
  mode match {
    case Append => MongoSpark.save(data, writeConfig)
    case Overwrite =>
      // Note: Overwrite drops the existing collection before writing -- use with caution
      mongoConnector.withCollectionDo(writeConfig, { collection: MongoCollection[Document] => collection.drop() })
      MongoSpark.save(data, writeConfig)
    case ErrorIfExists =>
      if (collectionExists) {
        throw new UnsupportedOperationException("MongoCollection already exists")
      } else {
        MongoSpark.save(data, writeConfig)
      }
    case Ignore =>
      if (!collectionExists) {
        MongoSpark.save(data, writeConfig)
      }
  }
  constructRelation(sqlContext, parameters ++ writeConfig.asOptions, Some(data.schema))
}
/**
 * Save data to MongoDB
 *
 * Pay attention to the note below: if the dataFrame contains an `_id` field, the written data replaces any existing document with the same `_id` in the collection.
 * '''Note:''' If the dataFrame contains an `_id` field the data will be upserted and replace any existing documents in the collection.
 *
 * @param dataset the dataset to save to MongoDB
 * @param writeConfig the writeConfig
 * @tparam D
 * @since 1.1.0
 */
def save[D](dataset: Dataset[D], writeConfig: WriteConfig): Unit = {
  val mongoConnector = MongoConnector(writeConfig.asOptions)
  val dataSet = dataset.toDF()
  val mapper = rowToDocumentMapper(dataSet.schema, writeConfig.extendedBsonTypes)
  val documentRdd: RDD[BsonDocument] = dataSet.rdd.map(row => mapper(row))
  val fieldNames = dataset.schema.fieldNames.toList
  // the query key defaults to _id when no shard key is configured
  val queryKeyList = BsonDocument.parse(writeConfig.shardKey.getOrElse("{_id: 1}")).keySet().asScala.toList
  if (writeConfig.forceInsert || !queryKeyList.forall(fieldNames.contains(_))) {
    // query key not present in the schema (or inserts forced): plain inserts
    MongoSpark.save(documentRdd, writeConfig)
  } else {
    documentRdd.foreachPartition(iter => if (iter.nonEmpty) {
      mongoConnector.withCollectionDo(writeConfig, { collection: MongoCollection[BsonDocument] =>
        iter.grouped(writeConfig.maxBatchSize).foreach(batch => {
          val requests = batch.map(doc =>
            if (queryKeyList.forall(doc.containsKey(_))) {
              // the document carries the query key (e.g. _id): upsert instead of insert
              val queryDocument = new BsonDocument()
              queryKeyList.foreach(key => queryDocument.append(key, doc.get(key)))
              if (writeConfig.replaceDocument) {
                new ReplaceOneModel[BsonDocument](queryDocument, doc, new ReplaceOptions().upsert(true))
              } else {
                queryDocument.keySet().asScala.foreach(doc.remove(_))
                new UpdateOneModel[BsonDocument](queryDocument, new BsonDocument("$set", doc), new UpdateOptions().upsert(true))
              }
            } else {
              new InsertOneModel[BsonDocument](doc)
            })
          collection.bulkWrite(requests.toList.asJava, new BulkWriteOptions().ordered(writeConfig.ordered))
        })
      })
    })
  }
}
Given this behavior, all the application needs to do is pick a column that uniquely identifies each record and expose it as `_id`.
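If no single column is unique on its own, a composite key can be hashed into `_id` before the write; and when the increment carries only the changed fields, the replaceDocument setting seen in the source above can be turned off so the connector issues $set updates instead of whole-document replacements. A hedged sketch, continuing from the increment DataFrame in the earlier example (column names and URI are illustrative):

import org.apache.spark.sql.functions.{col, concat_ws, sha2}

// derive a deterministic `_id` from a combination of business columns
val withId = increment.withColumn("_id", sha2(concat_ws("|", col("userId"), col("orderDate")), 256))

withId.write
  .format("com.mongodb.spark.sql.DefaultSource")
  .option("uri", "mongodb://host:27017/db.coll")
  .option("replaceDocument", "false")   // partial update via $set rather than a full replace
  .mode("append")
  .save()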
The above comes from a document shared by our project team lead.