1、创建 DataFrame
// One parsed row of the input CSV: uid,tag,info.
// (case-class parameters are `val` by default; the explicit `val`s were redundant)
final case class Data(uid: Int, tag: String, info: String)

// Build a local SparkSession.
val conf = new SparkConf().setMaster("local[2]").setAppName("DataLoading")
val spark = SparkSession.builder().config(conf).getOrCreate()

// Load the raw text file; each line is one comma-separated record.
val dataRDD = spark.sparkContext.textFile("加载地址")

import spark.implicits._

// Parse each line into a Data row, then convert the RDD to a DataFrame.
val dataFrame = dataRDD.map { line =>
  val x = line.split(",")
  Data(x(0).trim.toInt, x(1).trim, x(2).trim)
}.toDF()

spark.close()
2、通过 DataFrame 将数据导入到 MongoDB。
// Connect to MongoDB and drop the target collection so the import starts clean.
val mongoClient = MongoClient(MongoClientURI("mongodb://ip:27017/库名"))
mongoClient("数据库名")("表名").dropCollection()

// Write the DataFrame through the MongoDB Spark connector.
// BUG FIX: the option key is "collection" — the original "collectiong" typo
// meant the connector ignored it and fell back to the collection in the URI.
dataFrame.write
  .option("uri", "mongodb://ip:27017/库名")
  .option("collection", "表名")
  .mode("overwrite")
  .format("com.mongodb.spark.sql")
  .save()

// Index the freshly written collection, then release the client.
mongoClient("数据库名")("表名").createIndex(MongoDBObject("列名" -> 1))
mongoClient.close()
案例:将 Movies Rating Tag 导入 MongoDB 数据库中。
1、样本类封装 Model.scala:
package DataLoader
/**
  * Movie record (电影数据表).
  *
  * @param mid      movie ID
  * @param name     movie title
  * @param descri   movie description
  * @param timelong movie runtime
  * @param issue    release date
  * @param shoot    shooting date
  * @param language movie language
  * @param genres   genres the movie belongs to
  * @param actors   cast list
  * @param director director
  */
final case class Movies(mid: Int,
                        name: String,
                        descri: String,
                        timelong: String,
                        issue: String,
                        shoot: String,
                        language: String,
                        genres: String,
                        actors: String,
                        director: String)
/**
  * Rating record (评分表).
  *
  * @param uid       user ID
  * @param mid       movie ID
  * @param score     rating score
  * @param timestamp rating time (epoch value — TODO confirm unit against source CSV)
  */
final case class Ratings(uid: Int, mid: Int, score: Double, timestamp: Long)
/**
  * Tag record (标签表).
  *
  * @param uid       user ID
  * @param mid       movie ID
  * @param tag       tag text the user attached to the movie
  * @param timestamp tagging time (epoch value — TODO confirm unit against source CSV)
  */
final case class Tags(uid: Int, mid: Int, tag: String, timestamp: Long)
case class MongoConfig( val uri: String , val db: String )
2、DataLoad.scala:
package DataLoader
import com. mongodb. casbah. commons. MongoDBObject
import com. mongodb. casbah. { MongoClient, MongoClientURI}
import org. apache. spark. SparkConf
import org. apache. spark. sql. { DataFrame, SparkSession}
/**
  * Loads the movies / ratings / tags CSV files into Spark DataFrames and
  * imports them into MongoDB, recreating and indexing each collection.
  */
object DataLoad {

  // Target collection names in MongoDB (constants — were needlessly `var`).
  val MOVIES_COLLECTION_NAME = "Movies"
  val RATINGS_COLLECTION_NAME = "Ratings"
  val TAGS_COLLECTION_NAME = "Tags"

  def main(args: Array[String]): Unit = {
    // Local CSV sources.
    val MOVIES_INPATH = "G:\\School\\Bigdata\\Info\\reco_data\\small\\movies.csv"
    val RATINGS_INPATH = "G:\\School\\Bigdata\\Info\\reco_data\\small\\ratings.csv"
    val TAGS_INPATH = "G:\\School\\Bigdata\\Info\\reco_data\\small\\tags.csv"

    // Job parameters. All values are strings, so a typed immutable Map
    // replaces the original mutable Map[String, Any] + asInstanceOf casts.
    val params = Map(
      "spark.cores" -> "local[2]",
      "mongo.uri"   -> "mongodb://192.168.109.141:27017/recom",
      "mongo.db"    -> "recom"
    )

    implicit val mongoConfig: MongoConfig =
      MongoConfig(params("mongo.uri"), params("mongo.db"))

    // BUG FIX: the original looked up "spark.core" but the key was stored as
    // "spark.cores", which would throw NoSuchElementException at runtime.
    val conf = new SparkConf().setMaster(params("spark.cores")).setAppName("DataLoading")
    val spark = SparkSession.builder().config(conf).getOrCreate()

    val moviesRDD = spark.sparkContext.textFile(MOVIES_INPATH)
    val ratingsRDD = spark.sparkContext.textFile(RATINGS_INPATH)
    val tagsRDD = spark.sparkContext.textFile(TAGS_INPATH)

    import spark.implicits._

    // movies.csv is '^'-separated with 10 fields (split takes a regex, hence the escape).
    val moviesDF = moviesRDD.map { line =>
      val x = line.split("\\^")
      Movies(x(0).trim.toInt, x(1).trim, x(2).trim, x(3).trim, x(4).trim,
        x(5).trim, x(6).trim, x(7).trim, x(8).trim, x(9).trim)
    }.toDF()

    // ratings.csv: uid,mid,score,timestamp
    val ratingsDF = ratingsRDD.map { line =>
      val x = line.split(",")
      Ratings(x(0).trim.toInt, x(1).trim.toInt, x(2).trim.toDouble, x(3).trim.toLong)
    }.toDF()

    // tags.csv: uid,mid,tag,timestamp
    val tagsDF = tagsRDD.map { line =>
      val x = line.split(",")
      Tags(x(0).trim.toInt, x(1).trim.toInt, x(2).trim, x(3).trim.toLong)
    }.toDF()

    saveData2Mongo(moviesDF, ratingsDF, tagsDF)
    spark.close()
  }

  /**
    * Drops the three target collections, writes the DataFrames through the
    * MongoDB Spark connector, and creates the query-side indexes.
    *
    * @param moviesDF    movies DataFrame
    * @param ratingsDF   ratings DataFrame
    * @param tagsDF      tags DataFrame
    * @param mongoConfig implicit connection settings (uri + db)
    */
  private def saveData2Mongo(moviesDF: DataFrame, ratingsDF: DataFrame,
                             tagsDF: DataFrame)(implicit mongoConfig: MongoConfig): Unit = {
    val mongoClient = MongoClient(MongoClientURI(mongoConfig.uri))

    // Start from empty collections so reruns are repeatable.
    mongoClient(mongoConfig.db)(MOVIES_COLLECTION_NAME).dropCollection()
    mongoClient(mongoConfig.db)(RATINGS_COLLECTION_NAME).dropCollection()
    mongoClient(mongoConfig.db)(TAGS_COLLECTION_NAME).dropCollection()

    // Local helper removes the triplicated writer boilerplate.
    // BUG FIX: the option key is "collection" — the original "collectiong"
    // typo meant the connector ignored it and used the URI's default collection.
    def write(df: DataFrame, collection: String): Unit =
      df.write
        .option("uri", mongoConfig.uri)
        .option("collection", collection)
        .mode("overwrite")
        .format("com.mongodb.spark.sql")
        .save()

    write(moviesDF, MOVIES_COLLECTION_NAME)
    write(ratingsDF, RATINGS_COLLECTION_NAME)
    write(tagsDF, TAGS_COLLECTION_NAME)

    // Lookups downstream are by movie (mid) and by user (uid).
    mongoClient(mongoConfig.db)(MOVIES_COLLECTION_NAME).createIndex(MongoDBObject("mid" -> 1))
    mongoClient(mongoConfig.db)(RATINGS_COLLECTION_NAME).createIndex(MongoDBObject("mid" -> 1))
    mongoClient(mongoConfig.db)(RATINGS_COLLECTION_NAME).createIndex(MongoDBObject("uid" -> 1))
    mongoClient(mongoConfig.db)(TAGS_COLLECTION_NAME).createIndex(MongoDBObject("mid" -> 1))
    mongoClient(mongoConfig.db)(TAGS_COLLECTION_NAME).createIndex(MongoDBObject("uid" -> 1))
    mongoClient.close()
  }
}