Batch reading of Alibaba Cloud TableStore range data with Spark

1. Reading TableStore data with Scala

Both the start primary key and the end primary key must be built from their own builder:

primaryKeyBuilder = PrimaryKeyBuilder.createPrimaryKeyBuilder()

Reusing a single builder for both keys fails with a duplicate-rowkey error; see the sketch below, and note that the getRange implementation further down uses the same pattern.
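A minimal sketch of this point (the primary-key column name id and the bound values are placeholders):

// Reusing one builder for both bounds would add the same primary-key column twice,
// triggering the duplicate-rowkey error mentioned above. Create one builder per bound:
val startKey = {
  val b = PrimaryKeyBuilder.createPrimaryKeyBuilder()
  b.addPrimaryKeyColumn("id", PrimaryKeyValue.fromString("0001"))
  b.build()
}
val endKey = {
  val b = PrimaryKeyBuilder.createPrimaryKeyBuilder()
  b.addPrimaryKeyColumn("id", PrimaryKeyValue.fromString("0999"))
  b.build()
}

The complete read utility: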

import com.alicloud.openservices.tablestore.SyncClient
import com.alicloud.openservices.tablestore.model._

import scala.collection.mutable.ListBuffer
import scala.collection.JavaConversions._ // implicit java.util.List -> Scala collection for getRows()

case class OtsReadUtils() {

  def otsRead(tableName: String, pkName: String, startPkValue: String, endPkValue: String): ListBuffer[Row] = {
    val client = init
    //getRow(client, "31", "id")
    getRange(client, tableName, pkName, startPkValue, endPkValue)
  }

  def init: SyncClient = {
    // Placeholder endpoint and credentials; replace with your own instance's values.
    val endPoint = "http://dxl.com.cn"
    val accessKeyId = "keyid"
    val accessKeySecret = "333333333333"
    val instanceName = "ssz"
    new SyncClient(endPoint, accessKeyId, accessKeySecret, instanceName)
  }

  def getRow(client: SyncClient, pkValue: String, pkName: String): Unit = {
    // Build the primary key.
    val primaryKeyBuilder = PrimaryKeyBuilder.createPrimaryKeyBuilder
    primaryKeyBuilder.addPrimaryKeyColumn(pkName, PrimaryKeyValue.fromString(pkValue))
    val primaryKey = primaryKeyBuilder.build
    // Read a single row.
    val criteria = new SingleRowQueryCriteria("test03", primaryKey) // table name
    // Read only the latest version.
    criteria.setMaxVersions(1)
    var getRowResponse = client.getRow(new GetRowRequest(criteria))
    var row = getRowResponse.getRow
    println("Read finished, result: ")
    //println(row)
    // Restrict the read to specific columns.
    criteria.addColumnsToGet("Col0") // column name
    getRowResponse = client.getRow(new GetRowRequest(criteria))
    row = getRowResponse.getRow
    println("Read finished, result: ")
    //println(row)
  }

  /**
   * Range read; rowkey format: FF yyyyMMdd device_id hhmiss
   *
   * @param client       TableStore sync client
   * @param tableName    name of the table to scan
   * @param pkName       name of the primary-key column
   * @param startPkValue inclusive start of the primary-key range
   * @param endPkValue   exclusive end of the primary-key range
   */
  def getRange(client: SyncClient, tableName: String, pkName: String, startPkValue: String, endPkValue: String): ListBuffer[Row] = {
    val rangeRowQueryCriteria: RangeRowQueryCriteria = new RangeRowQueryCriteria(tableName)
    // Set the inclusive start primary key.
    var primaryKeyBuilder: PrimaryKeyBuilder = PrimaryKeyBuilder.createPrimaryKeyBuilder()
    primaryKeyBuilder.addPrimaryKeyColumn(pkName, PrimaryKeyValue.fromString(startPkValue))
    rangeRowQueryCriteria.setInclusiveStartPrimaryKey(primaryKeyBuilder.build())
    // Set the exclusive end primary key, using a freshly created builder (see the note above).
    primaryKeyBuilder = PrimaryKeyBuilder.createPrimaryKeyBuilder()
    primaryKeyBuilder.addPrimaryKeyColumn(pkName, PrimaryKeyValue.fromString(endPkValue))
    rangeRowQueryCriteria.setExclusiveEndPrimaryKey(primaryKeyBuilder.build())
    // Read only the latest version of each column.
    rangeRowQueryCriteria.setMaxVersions(1)

    println("GetRange results:")
    var list = ListBuffer[Row]()
    while (true) {
      val getRangeResponse: GetRangeResponse = client.getRange(new GetRangeRequest(rangeRowQueryCriteria))
      list = list ++ getRangeResponse.getRows()
      //list.foreach(row => println(row))

      // If nextStartPrimaryKey is not null, there is more data: keep scanning from it.
      if (getRangeResponse.getNextStartPrimaryKey() != null) {
        rangeRowQueryCriteria.setInclusiveStartPrimaryKey(getRangeResponse.getNextStartPrimaryKey())
      } else {
        return list
      }
    }
    list // unreachable, but keeps the method's return type satisfied
}
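
A minimal usage sketch of the utility above (table name, primary-key column name, and range bounds are placeholders):

object OtsReadDemo {
  def main(args: Array[String]): Unit = {
    // Placeholder table, primary-key column, and range bounds.
    val rows = OtsReadUtils().otsRead("test03", "id", "0001", "0999")
    println(s"rows fetched: ${rows.length}")
    rows.take(5).foreach(println)
  }
}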

2. Converting the OTS result collection to a Spark DataFrame

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ListBuffer

def otsToDF(spark: SparkSession, tableName: String, rowkey: String, startPkValue: String, endPkValue: String): DataFrame = {
    // Call the utility from section 1 to fetch the OTS row collection.
    val rowList: ListBuffer[com.alicloud.openservices.tablestore.model.Row] = OtsReadUtils().otsRead(tableName, rowkey, startPkValue, endPkValue)
    // Print a sample (up to 10 rows) of the raw data.
    if (null != rowList && rowList.nonEmpty) {
      rowList.take(10).foreach(r => {
        val columns = r.getColumns
        println(columns.map(column => column.getName + ":" + column.getValue.asString()).mkString("--"))
      })
    }

    import org.apache.spark.sql.types._
    val schema2 = StructType(List(
      StructField("ACTION_TIME", StringType, nullable = true),
      StructField("ACTION_TYPE", StringType, nullable = true),
      StructField("CITY_NAME", StringType, nullable = true)
    ))

    val rdd2: RDD[Row] = spark.sparkContext.parallelize(
      rowList.map(r => {
        org.apache.spark.sql.Row(
          columnToString(r.getColumn("ACTION_TIME")),
          columnToString(r.getColumn("ACTION_TYPE")),
          columnToString(r.getColumn("CITY_NAME"))
        )
      })
    )

    val df2 = spark.createDataFrame(rdd2, schema2)

    df2.show(10, false)

    println("[certus.ots.mid]================== get ots:" + tableName + df2.count().toString)

    df2
  }
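
The snippet above relies on a columnToString helper that the post does not show. Here is a minimal sketch, under the assumption that r.getColumn(name) returns the column's version list (java.util.List[Column], as in the TableStore Java SDK) and that the attribute columns are string-typed:

import com.alicloud.openservices.tablestore.model.Column
import scala.collection.JavaConverters._

// Hypothetical helper, not shown in the original post: take the newest version of an
// attribute column and render its value as a string; return null if the column is absent.
def columnToString(columns: java.util.List[Column]): String = {
  if (columns == null || columns.isEmpty) null
  else columns.asScala.maxBy(_.getTimestamp).getValue.asString() // assumes STRING-typed columns
}

And a usage sketch for otsToDF itself (the SparkSession settings and all argument values are placeholders):

val spark = SparkSession.builder().appName("ots-to-df").master("local[*]").getOrCreate()
val df = otsToDF(spark, "test03", "id", "0001", "0999")
df.printSchema()
spark.stop()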
