spark scala-读取各类数据源

本文主要介绍如何使用 Scala 编写 Spark 程序来读取各类数据源(Hive、JSON、Parquet)。

1 读取hive数据

/**
 * Demonstrates reading from and writing to Hive tables with Spark SQL.
 *
 * The job loads two local text files into the Hive tables `student_infos`
 * and `student_scores`, joins them to select the students whose score is at
 * least 80, stores the result as the Hive table `good_student_infos`, and
 * finally reads that table back and prints every row.
 *
 * The snippet needs `org.apache.spark.SparkConf`, `org.apache.spark.SparkContext`
 * and `org.apache.spark.sql.hive.HiveContext` to be imported.
 *
 * @author jhp
 */
object HiveDataSource {

  /**
   * Entry point of the Spark application.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("HiveDataSource")
    val sc = new SparkContext(conf)

    try {
      val hiveContext = new HiveContext(sc)

      // (Re)create the student_infos table and load it from the local file.
      hiveContext.sql("DROP TABLE IF EXISTS student_infos")
      hiveContext.sql("CREATE TABLE IF NOT EXISTS student_infos (name STRING, age INT)")
      hiveContext.sql("LOAD DATA "
        + "LOCAL INPATH '/usr/local/spark-study/resources/student_infos.txt' "
        + "INTO TABLE student_infos")

      // (Re)create the student_scores table and load it from the local file.
      hiveContext.sql("DROP TABLE IF EXISTS student_scores")
      hiveContext.sql("CREATE TABLE IF NOT EXISTS student_scores (name STRING, score INT)")
      hiveContext.sql("LOAD DATA "
        + "LOCAL INPATH '/usr/local/spark-study/resources/student_scores.txt' "
        + "INTO TABLE student_scores")

      // Join both tables and keep only the students with a score of at least 80.
      val goodStudentsDF = hiveContext.sql("SELECT si.name, si.age, ss.score "
        + "FROM student_infos si "
        + "JOIN student_scores ss ON si.name=ss.name "
        + "WHERE ss.score>=80")

      // Persist the result as a Hive table. The DataFrameWriter API replaces the
      // deprecated DataFrame.saveAsTable and matches the read/write style used
      // by the other examples in this article.
      hiveContext.sql("DROP TABLE IF EXISTS good_student_infos")
      goodStudentsDF.write.saveAsTable("good_student_infos")

      // Read the saved table back and print it on the driver.
      hiveContext.table("good_student_infos").collect().foreach(println)
    } finally {
      // Always release the cluster resources, even if a statement above fails.
      sc.stop()
    }
  }

}

2 读取json数据

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.LongType


/**
 * Demonstrates reading and writing JSON data with Spark SQL.
 *
 * The job reads student scores from a JSON file on HDFS, builds a second
 * DataFrame with basic student information from in-memory JSON strings,
 * joins the students scoring at least 80 with their basic information, and
 * writes the joined result back to HDFS as JSON.
 *
 * @author jhp
 */
object JSONDataSource {

  /**
   * Entry point of the Spark application.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("JSONDataSource")
    val sc = new SparkContext(conf)

    try {
      val sqlContext = new SQLContext(sc)

      // Build the student score DataFrame from the JSON file.
      val studentScoresDF = sqlContext.read.json("hdfs://spark1:9000/spark-study/students.json")

      // Select the name and score of every student scoring at least 80.
      studentScoresDF.registerTempTable("student_scores")
      val goodStudentScoresDF =
        sqlContext.sql("select name,score from student_scores where score>=80")
      val goodStudentNames = goodStudentScoresDF.rdd.map(_.getAs[String]("name")).collect()

      // Build the student basic-information DataFrame from JSON strings.
      val studentInfoJSONs = Array("{\"name\":\"Leo\", \"age\":18}",
        "{\"name\":\"Marry\", \"age\":17}",
        "{\"name\":\"Jack\", \"age\":19}")
      val studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs, 3)
      val studentInfosDF = sqlContext.read.json(studentInfoJSONsRDD)

      // Look up the basic information of the students scoring at least 80.
      studentInfosDF.registerTempTable("student_infos")

      val goodStudentInfosDF =
        if (goodStudentNames.isEmpty) {
          // An empty "in ()" list is not valid SQL, so return an empty result
          // with the same columns instead of issuing the query.
          studentInfosDF.select("name", "age").limit(0)
        } else {
          // NOTE(review): names are interpolated into the SQL text unescaped; a
          // name containing a single quote would break the query. Acceptable for
          // this demo data set, but do not reuse with untrusted input.
          val quotedNames = goodStudentNames.map(name => s"'$name'").mkString(",")
          sqlContext.sql(s"select name,age from student_infos where name in ($quotedNames)")
        }

      // Join the scores with the basic information, keyed by student name.
      // Integer values parsed from JSON are read back as Long, hence getAs[Long].
      val scoresByName = goodStudentScoresDF.rdd.map { row =>
        (row.getAs[String]("name"), row.getAs[Long]("score"))
      }
      val agesByName = goodStudentInfosDF.rdd.map { row =>
        (row.getAs[String]("name"), row.getAs[Long]("age"))
      }

      // Convert the joined pairs into rows matching the schema declared below.
      val goodStudentRowsRDD = scoresByName.join(agesByName).map {
        case (name, (score, age)) => Row(name, score.toInt, age.toInt)
      }

      val structType = StructType(Array(
        StructField("name", StringType, true),
        StructField("score", IntegerType, true),
        StructField("age", IntegerType, true)))

      val goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType)

      // Save the DataFrame as JSON. In the original listing this statement had
      // been merged into the preceding comment and therefore never executed.
      goodStudentsDF.write.format("json").save("hdfs://spark1:9000/spark-study/good-students-scala")
    } finally {
      // Always release the cluster resources, even if a step above fails.
      sc.stop()
    }
  }

}

3 读取parquet数据

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SaveMode

/**
 * Demonstrates Parquet schema merging with Spark SQL.
 *
 * Two DataFrames with different columns (`name`/`age` and `name`/`grade`) are
 * appended to the same Parquet directory, which is then read back with the
 * `mergeSchema` option enabled; the schema and the data are printed.
 *
 * @author jhp
 */
object ParquetMergeSchema {

  /**
   * Entry point of the Spark application.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ParquetMergeSchema")
    val sc = new SparkContext(conf)

    try {
      val sqlContext = new SQLContext(sc)

      // Brings the toDF conversion for RDDs of tuples into scope.
      import sqlContext.implicits._

      // Both DataFrames are appended to, and later read from, this directory.
      val studentsPath = "hdfs://spark1:9000/spark-study/students"

      // First DataFrame: basic student information (name, age), appended as Parquet.
      // The DataFrameWriter API replaces the deprecated DataFrame.save(path, source, mode).
      val studentsWithNameAge = Seq(("leo", 23), ("jack", 25))
      val studentsWithNameAgeDF = sc.parallelize(studentsWithNameAge, 2).toDF("name", "age")
      studentsWithNameAgeDF.write.format("parquet").mode(SaveMode.Append).save(studentsPath)

      // Second DataFrame: student grades (name, grade), appended to the same directory.
      val studentsWithNameGrade = Seq(("marry", "A"), ("tom", "B"))
      val studentsWithNameGradeDF = sc.parallelize(studentsWithNameGrade, 2).toDF("name", "grade")
      studentsWithNameGradeDF.write.format("parquet").mode(SaveMode.Append).save(studentsPath)

      // The two DataFrames have different schemas: one has the columns
      // (name, age), the other (name, grade). The expectation is that reading
      // the directory merges both schemas into three columns: name, age, grade.

      // Read the students data with mergeSchema enabled so the schemas are merged.
      val students = sqlContext.read.option("mergeSchema", "true").parquet(studentsPath)
      students.printSchema()
      students.show()
    } finally {
      // Always release the cluster resources, even if a step above fails.
      sc.stop()
    }
  }

}

阅读更多
文章标签: spark scala 
个人分类: Spark知识汇合篇
上一篇spark scala-transformation基础操作
下一篇spark scala-distinct使用
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

关闭
关闭