1. Scala 读取 Tablestore 数据
注意：起始主键和结束主键都必须各自重新创建一个 builder：
primaryKeyBuilder = PrimaryKeyBuilder.createPrimaryKeyBuilder()
否则（复用同一个 builder 构建两个主键时）会报错 rowkey 重复。
import com.alicloud.openservices.tablestore.SyncClient
import com.alicloud.openservices.tablestore.model._
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
import scala.collection.JavaConversions._
/**
 * Helper for reading rows from an Alibaba Cloud Tablestore (OTS) table through
 * the synchronous Java SDK client.
 *
 * `Row` in this class is the Tablestore model row
 * (`com.alicloud.openservices.tablestore.model.Row`), not a Spark row.
 */
case class OtsReadUtils() {

  /**
   * Reads all rows of `tableName` whose primary key column `pkName` lies in the
   * range from `startPkValue` (inclusive) to `endPkValue` (exclusive).
   *
   * A client is created for this call and shut down before returning, so the
   * client's resources are released even when the read fails.
   *
   * @param tableName    table to scan
   * @param pkName       name of the (string-typed) primary key column
   * @param startPkValue inclusive start of the primary key range
   * @param endPkValue   exclusive end of the primary key range
   * @return every row returned by the range scan, in the order received
   */
  def otsRead(tableName: String, pkName: String, startPkValue: String, endPkValue: String): ListBuffer[Row] = {
    val client = init
    try getRange(client, tableName, pkName, startPkValue, endPkValue)
    finally client.shutdown()
  }

  /**
   * Builds a new [[SyncClient]] from the connection settings below.
   *
   * NOTE(review): endpoint, access key id/secret and instance name are
   * hard-coded here. If these are real credentials they should be loaded from
   * configuration or the environment instead of living in source.
   */
  def init: SyncClient = {
    val endPoint = "http://dxl.com.cn"
    val accessKeyId = "keyid"
    val accessKeySecret = "333333333333"
    val instanceName = "ssz"
    new SyncClient(endPoint, accessKeyId, accessKeySecret, instanceName)
  }

  /**
   * Demo single-row read: fetches the row identified by `pkName` = `pkValue`
   * twice (first with all columns, then restricted to column `Col0`) and prints
   * a completion message after each request. The fetched rows are not returned
   * or printed.
   *
   * @param client    client used to issue the requests (not shut down here)
   * @param pkValue   string value of the primary key
   * @param pkName    name of the primary key column
   * @param tableName table to read from; defaults to the previously hard-coded
   *                  `"test03"` so existing callers behave as before
   */
  def getRow(client: SyncClient, pkValue: String, pkName: String, tableName: String = "test03"): Unit = {
    // Build the primary key.
    val primaryKeyBuilder = PrimaryKeyBuilder.createPrimaryKeyBuilder
    primaryKeyBuilder.addPrimaryKeyColumn(pkName, PrimaryKeyValue.fromString(pkValue))
    val primaryKey = primaryKeyBuilder.build

    // Read one row, latest version only.
    val criteria = new SingleRowQueryCriteria(tableName, primaryKey)
    criteria.setMaxVersions(1)
    client.getRow(new GetRowRequest(criteria)).getRow
    println("读取完毕,结果为: ")

    // Read the same row again, restricted to a single column.
    criteria.addColumnsToGet("Col0") // column name
    client.getRow(new GetRowRequest(criteria)).getRow
    println("读取完毕,结果为:")
  }

  /**
   * Range scan over `tableName`, following pagination until the service stops
   * returning a next start key.
   *
   * The original note on this method reads `FF yyyyMMdd device_id hhmiss`,
   * which is presumably the layout of the primary key value — TODO confirm.
   *
   * @param client       client used to issue the requests (not shut down here)
   * @param tableName    table to scan
   * @param pkName       name of the (string-typed) primary key column
   * @param startPkValue inclusive start of the primary key range
   * @param endPkValue   exclusive end of the primary key range
   * @return all rows from all pages, latest version of each column only
   */
  def getRange(client: SyncClient, tableName: String, pkName: String, startPkValue: String, endPkValue: String): ListBuffer[Row] = {
    val rangeRowQueryCriteria = new RangeRowQueryCriteria(tableName)

    // Start key (inclusive). A fresh builder is created for each key; per the
    // file's notes, building both keys from one builder fails with a
    // duplicate-rowkey error.
    val startKeyBuilder = PrimaryKeyBuilder.createPrimaryKeyBuilder()
    startKeyBuilder.addPrimaryKeyColumn(pkName, PrimaryKeyValue.fromString(startPkValue))
    rangeRowQueryCriteria.setInclusiveStartPrimaryKey(startKeyBuilder.build())

    // End key (exclusive).
    val endKeyBuilder = PrimaryKeyBuilder.createPrimaryKeyBuilder()
    endKeyBuilder.addPrimaryKeyColumn(pkName, PrimaryKeyValue.fromString(endPkValue))
    rangeRowQueryCriteria.setExclusiveEndPrimaryKey(endKeyBuilder.build())

    rangeRowQueryCriteria.setMaxVersions(1)
    println("GetRange的结果为:")

    val rows = ListBuffer[Row]()
    var hasMore = true
    while (hasMore) {
      val response = client.getRange(new GetRangeRequest(rangeRowQueryCriteria))
      // Append in place; rebuilding the buffer per page copied all earlier rows.
      rows ++= response.getRows()
      // A non-null next start key means there are more pages to fetch.
      val nextStart = response.getNextStartPrimaryKey()
      if (nextStart != null) rangeRowQueryCriteria.setInclusiveStartPrimaryKey(nextStart)
      else hasMore = false
    }
    rows
  }
}
2. 将 OTS 读取的集合转换为 Spark DataFrame
/**
 * Reads a primary-key range from a Tablestore (OTS) table and exposes the
 * columns ACTION_TIME, ACTION_TYPE and CITY_NAME as a Spark DataFrame of
 * nullable strings.
 *
 * All rows are first collected on the driver via `OtsReadUtils().otsRead` and
 * then parallelized. Up to ten raw rows and up to ten DataFrame rows are
 * printed, followed by a log line with the table name and row count.
 *
 * @param spark        active Spark session
 * @param tableName    OTS table to read
 * @param rowkey       name of the primary key column
 * @param startPkValue start of the primary key range, passed to `otsRead`
 * @param endPkValue   end of the primary key range, passed to `otsRead`
 * @return DataFrame with one row per OTS row read
 */
def otsToDF(spark: SparkSession, tableName: String, rowkey: String, startPkValue: String, endPkValue: String): DataFrame = {
  import org.apache.spark.sql.types._
  import spark.implicits._

  // Fetch the OTS rows using the reader from section 1.
  val rowList: ListBuffer[com.alicloud.openservices.tablestore.model.Row] =
    OtsReadUtils().otsRead(tableName, rowkey, startPkValue, endPkValue)

  // Preview the raw data. take(10) already copes with short lists, so the
  // preview is no longer skipped when there are ten rows or fewer.
  // NOTE(review): asString() presumably fails on non-string column values —
  // confirm all previewed columns are strings.
  rowList.take(10).foreach { r =>
    val columns = r.getColumns
    println(columns.map(column => column.getName + ":" + column.getValue.asString()).mkString("--"))
  }

  val schema = StructType(List(
    StructField("ACTION_TIME", StringType, nullable = true),
    StructField("ACTION_TYPE", StringType, nullable = true),
    StructField("CITY_NAME", StringType, nullable = true)
  ))

  // Element type is left to inference (org.apache.spark.sql.Row) so it cannot
  // be confused with the Tablestore Row type.
  val rowRdd = spark.sparkContext.parallelize(
    rowList.map { r =>
      org.apache.spark.sql.Row(
        columnToString(r.getColumn("ACTION_TIME")),
        columnToString(r.getColumn("ACTION_TYPE")),
        columnToString(r.getColumn("CITY_NAME"))
      )
    }
  )

  val df = spark.createDataFrame(rowRdd, schema)
  df.show(10, truncate = false)
  // Table name and count are separated so the log line is unambiguous.
  println(s"[certus.ots.mid]================== get ots:$tableName count=${df.count()}")
  df
}