1 Reading and Writing Files
import org.apache.spark.{SparkConf, SparkContext}

object ReadFileAndSaveAsFile {
  def main(args: Array[String]): Unit = {
    // Read data from the local file system and save the result to HDFS
    // 1. Create the SparkContext
    val conf = new SparkConf()
    conf.setAppName("ReadFileAndSaveAsFile")
    conf.setMaster("local[2]")
    val path: String = "D:\\sparkDemo\\wordCount.txt"
    val sc = new SparkContext(conf)
    // 2. Read the file
    val rdd = sc.textFile(path)
    // 3. Word count
    // Transformations
    val result = rdd.map(line => {
      line.split("\\W+")
    }).flatMap(it => it)
      .map((_, 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)
    // Action
    // 4. Save the result to HDFS
    result.saveAsTextFile("hdfs://hadoop102:9000/spark-demo")
    sc.stop()
  }
}
- How are the input splits computed?
Look at the source code, starting from:
val rdd = sc.textFile(path)
Step into
==> org.apache.spark.SparkContext#textFile
Method description:
/**
* Read a text file from HDFS, a local file system (available on all nodes), or any
* Hadoop-supported file system URI, and return it as an RDD of Strings.
*/
def textFile(
path: String,
minPartitions: Int = defaultMinPartitions): RDD[String] = withScope {
assertNotStopped()
hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
minPartitions).map(pair => pair._2.toString).setName(path)
}
Step 1: find out how defaultMinPartitions (the minimum number of partitions) is defined.
① The minimum partition count declared in org.apache.spark.SparkContext#defaultMinPartitions:
def defaultMinPartitions: Int = math.min(defaultParallelism, 2)
② Find out how defaultParallelism is set:
org.apache.spark.SparkContext#defaultParallelism
Method signature:
/** Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD). */
def defaultParallelism: Int = {
assertNotStopped()
taskScheduler.defaultParallelism
}
Next, find out how taskScheduler.defaultParallelism is determined:
org.apache.spark.scheduler.TaskScheduler#defaultParallelism
Method signature:
// Get the default level of parallelism to use in the cluster, as a hint for sizing jobs.
def defaultParallelism(): Int // declared as an abstract method
TaskScheduler is a trait: private[spark] trait TaskScheduler
Find its implementation:
org.apache.spark.scheduler.TaskSchedulerImpl#defaultParallelism
Method signature:
override def defaultParallelism(): Int = backend.defaultParallelism()
Find the backend's defaultParallelism method:
org.apache.spark.scheduler.SchedulerBackend#defaultParallelism // SchedulerBackend is again abstract (a trait)
Find its implementation for local mode: org.apache.spark.scheduler.local.LocalSchedulerBackend
override def defaultParallelism(): Int =
scheduler.conf.getInt("spark.default.parallelism", totalCores) // read from the configuration; since spark.default.parallelism was not set, the default applies, which is the number of cores given in the master URL (e.g. local[3] gives 3)
Conclusion:
def defaultMinPartitions: Int = math.min(defaultParallelism, 2) // debugging (with local[3]) shows defaultParallelism is 3; taking the minimum of the two gives a minimum partition count of 2
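If the defaults are not what the job needs, the parallelism can be set explicitly. A minimal sketch, reusing sc and path from the word-count example above (the value 4 and the app name are illustrative assumptions):

// Either pass minPartitions directly to textFile ...
val rdd4 = sc.textFile(path, minPartitions = 4)
println(rdd4.getNumPartitions)

// ... or set spark.default.parallelism on the SparkConf before creating the SparkContext
val conf = new SparkConf()
  .setAppName("PartitionDemo") // illustrative app name
  .setMaster("local[2]")
  .set("spark.default.parallelism", "4")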
Step 2: look at the split mechanism.
hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
minPartitions).map(pair => pair._2.toString).setName(path)
Step into org.apache.spark.SparkContext#hadoopFile
and keep stepping in until you reach HadoopRDD's getPartitions method:
org.apache.spark.rdd.HadoopRDD#getPartitions
Method signature:
override def getPartitions: Array[Partition] = {
val jobConf = getJobConf()
// add the credentials here as this can be called before SparkContext initialized
SparkHadoopUtil.get.addCredentials(jobConf)
val inputFormat = getInputFormat(jobConf)
val inputSplits = inputFormat.getSplits(jobConf, minPartitions)
val array = new Array[Partition](inputSplits.size)
for (i <- 0 until inputSplits.size) {
array(i) = new HadoopPartition(id, i, inputSplits(i))
}
array
}
// Look at how the splits are produced:
val inputSplits = inputFormat.getSplits(jobConf, minPartitions)
The InputFormat passed in is TextInputFormat (from the call hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], minPartitions)).
TextInputFormat does not override getSplits, so look at its parent class FileInputFormat.
The key code (decompiled) is: public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { ... } // numSplits receives the same value as minPartitions
FileStatus[] files = this.listStatus(job); // list all input files
long totalSize = 0L;
for(int i$ = 0; i$ < len$; ++i$) {
FileStatus file = arr$[i$];
if (file.isDirectory()) { // the input path may be a directory, but it must not contain sub-directories, otherwise the following exception is thrown
throw new IOException("Not a file: " + file.getPath());
}
totalSize += file.getLen(); // accumulate the total size of all files
}
long goalSize = totalSize / (long)(numSplits == 0 ? 1 : numSplits); // numSplits is 2 here, so goalSize = totalSize / 2
long minSize = Math.max(job.getLong("mapreduce.input.fileinputformat.split.minsize", 1L), this.minSplitSize); // minSplitSize is declared as: private long minSplitSize = 1L;
// mapreduce.input.fileinputformat.split.minsize is not set, so minSize evaluates to 1
The split logic is as follows:
for(int i$ = 0; i$ < len$; ++i$) { // iterate over all input files
FileStatus file = arr$[i$];
Path path = file.getPath();
long length = file.getLen(); // file length
if (length == 0L) {
splits.add(this.makeSplit(path, 0L, length, new String[0]));
} else {
FileSystem fs = path.getFileSystem(job);
BlockLocation[] blkLocations;
// get the file's block locations
if (file instanceof LocatedFileStatus) {
blkLocations = ((LocatedFileStatus)file).getBlockLocations();
} else {
blkLocations = fs.getFileBlockLocations(file, 0L, length);
}
if (!this.isSplitable(fs, path)) { // check whether the file is splittable
String[] splitHosts = this.getSplitHosts(blkLocations, 0L, length, clusterMap);
splits.add(this.makeSplit(path, 0L, length, splitHosts));
} else {
long blockSize = file.getBlockSize(); // block size: 32 MB for the local file system, 128 MB on an HDFS cluster
long splitSize = this.computeSplitSize(goalSize, minSize, blockSize); // compute the split size; here it comes out as 1259
/*
For this run the total file size is 2518 bytes, so goalSize = 1259, minSize = 1, blockSize = 33554432 (32 MB):
protected long computeSplitSize(long goalSize, long minSize, long blockSize) {
return Math.max(minSize, Math.min(goalSize, blockSize));
}
*/
long bytesRemaining;
String[] splitHosts;
// 2518 / 1259 = 2.0, clearly more than 1.1, so this file gets split
for(bytesRemaining = length; (double)bytesRemaining / (double)splitSize > 1.1D; bytesRemaining -= splitSize) { // while the remaining bytes exceed 1.1 times splitSize, keep cutting splits
splitHosts = this.getSplitHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
splits.add(this.makeSplit(path, length - bytesRemaining, splitSize, splitHosts));
}
if (bytesRemaining != 0L) { // after the loop bytesRemaining is 1259; if it is not 0, the remainder becomes one final split
splitHosts = this.getSplitHosts(blkLocations, length - bytesRemaining, bytesRemaining, clusterMap);
splits.add(this.makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts));
}
}
}
}
Summary: the split plan is generated from the file sizes, the minimum partition count (2), and the blockSize.
// Example: suppose there are four files
with sizes 3 MB, 4 MB, 5 MB and 35 MB.
Total size: 47 MB
goalSize = 47 / 2 = 23.5 MB
In local mode the block size is 32 MB, so splitSize = max(1, min(23.5 MB, 32 MB)) = 23.5 MB
How many splits? 5: one each for the 3, 4 and 5 MB files, and two for the 35 MB file (23.5 MB plus a remainder of 11.5 MB, since 11.5 / 23.5 < 1.1).
protected long computeSplitSize(long goalSize, long minSize, long blockSize) {
return Math.max(minSize, Math.min(goalSize, blockSize));
}
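As a sanity check on the arithmetic above, here is a small self-contained Scala sketch that mirrors the computeSplitSize formula and the 1.1 slack factor for those four files (assuming 1 MB = 1024 * 1024 bytes):

object SplitCountSketch {
  // Same formula as FileInputFormat.computeSplitSize
  def computeSplitSize(goalSize: Long, minSize: Long, blockSize: Long): Long =
    math.max(minSize, math.min(goalSize, blockSize))

  // Same loop structure as FileInputFormat.getSplits, but only counting the splits
  def countSplits(fileSizes: Seq[Long], numSplits: Int, minSize: Long, blockSize: Long): Int = {
    val goalSize = fileSizes.sum / math.max(numSplits, 1)
    val splitSize = computeSplitSize(goalSize, minSize, blockSize)
    fileSizes.map { length =>
      var bytesRemaining = length
      var count = 0
      while (bytesRemaining.toDouble / splitSize > 1.1) {
        count += 1
        bytesRemaining -= splitSize
      }
      if (bytesRemaining != 0) count += 1
      count
    }.sum
  }

  def main(args: Array[String]): Unit = {
    val mb = 1024L * 1024L
    // Four files of 3, 4, 5 and 35 MB; minPartitions = 2; local-mode block size = 32 MB
    println(countSplits(Seq(3 * mb, 4 * mb, 5 * mb, 35 * mb), numSplits = 2, minSize = 1L, blockSize = 32 * mb)) // prints 5
  }
}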
sc.sequenceFile reads a SequenceFile.
rdd.saveAsSequenceFile saves a (key, value) RDD as a SequenceFile (it is a method on pair RDDs, not on SparkContext).
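A minimal sketch of both calls, reusing the SparkContext sc from the earlier examples; the HDFS output path is illustrative:

// Write: saveAsSequenceFile is available on pair RDDs whose key/value types
// are Writable or have implicit Writable converters (String, Int, ...)
val pairs = sc.makeRDD(Seq(("spark", 1), ("hadoop", 2)))
pairs.saveAsSequenceFile("hdfs://hadoop102:9000/seq-demo")

// Read: the key and value types must be given explicitly
val back = sc.sequenceFile[String, Int]("hdfs://hadoop102:9000/seq-demo")
back.collect().foreach(println)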
2 Reading from and Writing to MySQL
Add the MySQL driver dependency:
<dependencies>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.27</version>
    </dependency>
</dependencies>
2.1 Reading data from MySQL
2.1.1 Using Spark's JdbcRDD
JdbcRDD signature:
/*
The query must contain two ? placeholders for parameters used to partition the results.
* For example,
* {{{
* select title, author from books where ? <= id and id <= ?
* }}}
class JdbcRDD[T: ClassTag](
sc: SparkContext,
getConnection: () => Connection,
sql: String,
lowerBound: Long,
upperBound: Long,
numPartitions: Int,
mapRow: (ResultSet) => T = JdbcRDD.resultSetToObjectArray _)
extends RDD[T](sc, Nil) with Logging {
*/
import java.sql.DriverManager

import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

object SparkJdbcDemo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("SparkJdbcDemo")
conf.setMaster("local[2]")
val sc = new SparkContext(conf)
val driver = "com.mysql.jdbc.Driver"
val url = "jdbc:mysql://hadoop102:3306/sparkdemo"
val userName = "root"
val passWd = "root"
// Read the data via JdbcRDD
val jdbcRdd = new JdbcRDD(
sc,
() => {
Class.forName(driver)
DriverManager.getConnection(url, userName, passWd)
}
,
"select id, name,sex from customer where id >=? and id <= ?",
1,
10,
2,
result => {
(Customer(result.getInt(1), result.getString(2), result.getString(3)))
}
)
jdbcRdd.collect().foreach(println)
sc.stop()
}
}
case class Customer(id: Int, name: String, sex: String)
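Worth noting: JdbcRDD splits the [lowerBound, upperBound] range evenly across numPartitions and binds each sub-range into the two ? placeholders, so every partition runs its own query. A small sketch of how the bounds are computed, mirroring the logic in JdbcRDD.getPartitions (runnable in the Scala REPL):

// With lowerBound = 1, upperBound = 10, numPartitions = 2 the queries become
//   partition 0: ... where id >= 1 and id <= 5
//   partition 1: ... where id >= 6 and id <= 10
def partitionBounds(lowerBound: Long, upperBound: Long, numPartitions: Int): Seq[(Long, Long)] = {
  val length = BigInt(1) + upperBound - lowerBound
  (0 until numPartitions).map { i =>
    val start = lowerBound + ((i * length) / numPartitions)
    val end = lowerBound + (((i + 1) * length) / numPartitions) - 1
    (start.toLong, end.toLong)
  }
}
partitionBounds(1, 10, 2).zipWithIndex.foreach { case ((s, e), i) =>
  println(s"partition $i: id >= $s and id <= $e")
}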
2.1.2 Using plain JDBC to query the database and wrap the results in an RDD
import java.sql.DriverManager

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

object SparkJdbcDemo1 {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("SparkJdbcDemo1")
conf.setMaster("local[2]")
val sc = new SparkContext(conf)
val driver = "com.mysql.jdbc.Driver"
val url = "jdbc:mysql://hadoop102:3306/sparkdemo"
val userName = "root"
val passWd = "root"
val sql:String="select id, name,sex from customer where id >=? and id <= ?"
Class.forName(driver)
val conn = DriverManager.getConnection(url, userName, passWd)
val st = conn.prepareStatement(sql)
st.setInt(1,1);
st.setInt(2,10);
val res = st.executeQuery()
var array = new ArrayBuffer[Customer1]();
while(res.next()){
array +=Customer1(
res.getInt("id"),
res.getString("name"),
res.getString("sex")
)
}
val rdd = sc.makeRDD(array)
rdd.collect().foreach(println)
sc.stop()
}
}
case class Customer1(id: Int, name: String, sex: String)
Output:
Customer1(1,zhangsan,man)
Customer1(2,lisi,woman)
Customer1(3,lisi,woman)
Customer1(4,lisi,woman)
Customer1(5,lisi,woman)
Customer1(6,lisi,woman)
Customer1(7,lisi,woman)
Customer1(8,lisi,woman)
Customer1(9,lisi,woman)
Customer1(10,zhangsan10,man)
2.2 Writing data from Spark to MySQL in batches
Write one partition at a time, using JDBC batching:
import java.sql.DriverManager

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

object SparkWriteToMySql1 {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("SparkWriteToMySql1")
conf.setMaster("local[2]")
val sc = new SparkContext(conf)
val driver = "com.mysql.jdbc.Driver"
val url = "jdbc:mysql://hadoop102:3306/sparkdemo"
val userName = "root"
val passWd = "root"
Class.forName(driver)
val sql= "insert into customer values(?,?,?)"
val customers = new ArrayBuffer[Customer1]()
for (elem <- 2001 to 4000) {
customers += Customer1(
elem,
"lisi"+elem,
"man"
)
}
val customerRdd = sc.makeRDD(customers,2)
// This runs on the executors; to avoid opening a connection per record, use foreachPartition and write one partition at a time
customerRdd.foreachPartition(it=>{
val conn = DriverManager.getConnection(url, userName, passWd)
val st = conn.prepareStatement(sql)
conn.setAutoCommit(false); // disable auto-commit so the whole batch is one transaction
it.foreach(t=>{
st.setInt(1,t.id)
st.setString(2,t.name)
st.setString(3,t.sex)
st.addBatch() // add the row to the JDBC batch instead of executing immediately
})
val ints = st.executeBatch()
println(ints.size)
conn.commit() // commit the transaction
st.close()
conn.close()
})
sc.stop()
}
}
3 Reading from and Writing to HBase
POM dependencies:
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.3.1</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.3.1</version>
</dependency>
3.1 Reading HBase data with Spark's newAPIHadoopRDD
package com.gc.spark.day04.hbase
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration}
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object HbaseReadDemo {
def main(args: Array[String]): Unit = {
// Read data from HBase
val conf = new SparkConf()
conf.setAppName("HbaseReadDemo")
conf.setMaster("local[2]")
val sc = new SparkContext(conf)
val hbaseConf: Configuration = HBaseConfiguration.create()
hbaseConf.set("hbase.zookeeper.quorum", "hadoop102,hadoop103,hadoop104")
hbaseConf.set(TableInputFormat.INPUT_TABLE, "customer")
// def newAPIHadoopRDD[K, V, F <: NewInputFormat[K, V]](
// conf: Configuration = hadoopConfiguration,
// fClass: Class[F],
// kClass: Class[K],
// vClass: Class[V]): RDD[(K, V)] = withScope {
val hbaseRdd:RDD[(ImmutableBytesWritable,Result)] = sc.newAPIHadoopRDD(
hbaseConf,
classOf[TableInputFormat],
classOf[ImmutableBytesWritable],
classOf[Result]
)
val hbaseRddRe: RDD[Array[(String, String, String, String)]] = hbaseRdd.map({
case (key: ImmutableBytesWritable, rs: Result) => {
val ke = Bytes.toString(key.get())
rs.rawCells().map(cell => {
(ke,
Bytes.toString( CellUtil.cloneFamily(cell)),
Bytes.toString(CellUtil.cloneQualifier(cell)),
Bytes.toString(CellUtil.cloneValue(cell))
)
})
}
})
hbaseRddRe.flatMap(it=>it).collect().foreach(println)
sc.stop()
}
}
3.2 Querying a single row by rowkey with the HBase API
def queryDataByRowKey(rowkey:String,tableName:String,conn: Connection) ={
val admin: Admin = conn.getAdmin
if (!admin.tableExists(TableName.valueOf(tableName))) {
throw new RuntimeException("table does not exist")
}
val table: Table = conn.getTable(TableName.valueOf(tableName))
val get: Get = new Get(Bytes.toBytes(rowkey))
val result: Result = table.get(get)
// Parse the result
val cells: Array[Cell] = result.rawCells()
val array = cells.map(cell => {
(Bytes.toString(CellUtil.cloneRow(cell)), // row key
Bytes.toString(CellUtil.cloneFamily(cell)), // column family
Bytes.toString(CellUtil.cloneQualifier(cell)), // column qualifier
Bytes.toString(CellUtil.cloneValue(cell)) // value
)
})
table.close();
admin.close()
array.toList
}
3.3 Scanning a rowkey range
def scanDataByRowkey(tableName:String,conn: Connection,startRow:String,stopRow:String): ArrayBuffer[(String, String, String, String)] ={
val admin: Admin = conn.getAdmin // Admin object, for table-level operations
if (!admin.tableExists(TableName.valueOf(tableName))) {
throw new RuntimeException("table does not exist")
}
val table: Table = conn.getTable(TableName.valueOf(tableName)) // Table object, for reading data
val scan: Scan = new Scan()
scan.addColumn(Bytes.toBytes("info"),Bytes.toBytes("name"))
scan.addColumn(Bytes.toBytes("info"),Bytes.toBytes("age"))
scan.addColumn(Bytes.toBytes("info"),Bytes.toBytes("sex"))
scan.setStartRow(Bytes.toBytes(startRow))
// Note: rowkeys are strings, so a call like scanDataByRowkey("customer", conn, "1", "50")
// matches every row whose key sorts between "1" and "50" in lexicographic (dictionary) order, not numeric order
scan.setStopRow(Bytes.toBytes(stopRow))
scan.setMaxResultSize(Long.MaxValue) // for testing: cap on the maximum result size
val scanner: ResultScanner = table.getScanner(scan)
val scIt: util.Iterator[Result] = scanner.iterator();
val buffer: ArrayBuffer[(String, String, String, String)] = new ArrayBuffer[(String,String,String,String)]()
while(scIt.hasNext) {
val result: Result = scIt.next()
val cells: Array[Cell] = result.rawCells()
cells.foreach(cell => {
buffer.append(
(
Bytes.toString(CellUtil.cloneRow(cell)), // row key
Bytes.toString(CellUtil.cloneFamily(cell)), // column family
Bytes.toString(CellUtil.cloneQualifier(cell)), // column qualifier
Bytes.toString(CellUtil.cloneValue(cell)) // value
)
)
})
}
admin.close()
table.close()
buffer
}
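The lexicographic ordering is easy to see with plain string sorting; this tiny, HBase-independent snippet shows why a scan from "1" to "50" also matches keys such as "10" or "2" (zero-padding numeric rowkeys is a common way to avoid this):

// String rowkeys sort lexicographically, not numerically
println(Seq("1", "2", "10", "20", "50", "100").sorted)
// List(1, 10, 100, 2, 20, 50)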
3.4 Program entry point
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("HBaseReadDemo2")
conf.setMaster("local[2]")
val sc = new SparkContext(conf)
// Connect to HBase, query the table with the HBase client API, and turn the results into an RDD
val hbaseConf: Configuration = HBaseConfiguration.create()
hbaseConf.set("hbase.zookeeper.quorum", "hadoop102,hadoop103,hadoop104")
// Create the connection
import org.apache.hadoop.hbase.client.ConnectionFactory
val conn: Connection = ConnectionFactory.createConnection(hbaseConf)
// Query a single row by rowkey
val list: List[(String, String, String, String)] = queryDataByRowKey("1","customer",conn)
val rdd: RDD[(String, String, String, String)] = sc.makeRDD(list)
rdd.collect().foreach(println)
// Query multiple rows with a scan
// val buffer: ArrayBuffer[(String, String, String, String)] = scanDataByRowkey("customer",conn,"1","50")
//
// val rdd1 = sc.makeRDD(buffer)
// rdd1.collect().foreach(println)
sc.stop()
}
3.5 Summary
- Converting between Java and Scala collections (needed, for example, to pass a list of Puts to the HBase client, as in the sketch below):
import scala.collection.JavaConverters._
puts.toList.asJava // convert the Scala collection to a java.util.List
- When working with RDDs, pay attention to where the code actually executes (driver vs. executor); capturing non-serializable objects in a closure leads to "Task not serializable" errors.
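A minimal sketch of a partition-wise write to HBase that uses the puts.toList.asJava idiom above; the table name "customer", the "info" column family, and the sample rows are assumptions for illustration, not code from the sections above:

import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.JavaConverters._

object HbaseWriteSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("HbaseWriteSketch").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Illustrative data: (rowkey, name, sex)
    val rdd = sc.makeRDD(Seq(("1001", "wangwu", "man"), ("1002", "zhaoliu", "woman")), 2)
    rdd.foreachPartition { it =>
      // Create the connection on the executor, once per partition, so nothing non-serializable is captured in the closure
      val hbaseConf = HBaseConfiguration.create()
      hbaseConf.set("hbase.zookeeper.quorum", "hadoop102,hadoop103,hadoop104")
      val conn = ConnectionFactory.createConnection(hbaseConf)
      val table = conn.getTable(TableName.valueOf("customer")) // assumed table name
      val puts = it.map { case (rowkey, name, sex) =>
        val put = new Put(Bytes.toBytes(rowkey))
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(name))
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("sex"), Bytes.toBytes(sex))
        put
      }
      table.put(puts.toList.asJava) // Scala List[Put] -> java.util.List[Put]
      table.close()
      conn.close()
    }
    sc.stop()
  }
}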