// Generating random test data in Scala + Spark
import java.util

import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, RowFactory, SparkSession}

import scala.collection.mutable.{ArrayBuffer, ListBuffer}
import scala.util.Random
object Test {
  def main(args: Array[String]): Unit = {
    // Environment setup; enableHiveSupport() is needed for the Hive DDL at the end of main
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStream")
    val spark: SparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
    spark.sparkContext.setLogLevel("WARN") // log verbosity
    // Number of rows to generate
    val datasize = "10"
    // JSON rules describing how each column's data is generated
    val colConfig = "[{\"coltype\":\"enum\",\"colname\":\"a\",\"enumRule\":\"a,c,d,f,g\"},{\"coltype\":\"num\",\"colname\":\"b\",\"min\":1,\"max\":10,\"floatNum\":2,\"numLength\":3,\"enumRule\":\"\"},{\"coltype\":\"num\",\"colname\":\"c\",\"min\":1,\"max\":10,\"floatNum\":2,\"numLength\":3,\"enumRule\":\"\"}]"
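    // The same config, pretty-printed for readability (enum columns draw from the
    // comma-separated enumRule list; num columns draw from [min, max) with floatNum decimals):
    //   [ {"coltype":"enum","colname":"a","enumRule":"a,c,d,f,g"},
    //     {"coltype":"num","colname":"b","min":1,"max":10,"floatNum":2,"numLength":3,"enumRule":""},
    //     {"coltype":"num","colname":"c","min":1,"max":10,"floatNum":2,"numLength":3,"enumRule":""} ]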
    // Column names and Spark types parsed from the JSON config
    val fields = new ArrayBuffer[StructField]
    val colConfigArray: JSONArray = JSON.parseArray(colConfig)
    for (i <- 0 until colConfigArray.size()) {
      val col: JSONObject = colConfigArray.getJSONObject(i)
      val coltype: String = col.getString("coltype")
      val colname: String = col.getString("colname")
      coltype match {
        case "enum"   => fields.append(StructField(colname, StringType))
        case "num"    => fields.append(StructField(colname, StringType)) // numeric values are kept as strings here
        case "String" => fields.append(StructField(colname, StringType))
        case "Int"    => fields.append(StructField(colname, IntegerType))
        case "Double" => fields.append(StructField(colname, DoubleType))
        case "Float"  => fields.append(StructField(colname, FloatType))
        case "Date"   => fields.append(StructField(colname, DateType))
        case _        => fields.append(StructField(colname, StringType))
      }
    }
    // Schema (field metadata) for the generated columns
    val schema: StructType = StructType(fields)
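    // For the sample colConfig above, the schema resolves to three columns a, b, c,
    // all StringType (both "enum" and "num" map to StringType)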
    // Buffer holding the generated random rows
    val dataList = new ListBuffer[Any]
    /**
     * Generate the random rows
     */
    val datasizeInt = Integer.parseInt(datasize)
    for (rowIndex <- 0 until datasizeInt) {
      val array = new util.ArrayList[String]
      // reuse the config parsed above instead of re-parsing it for every row
      for (i <- 0 until colConfigArray.size()) {
        val col: JSONObject = colConfigArray.getJSONObject(i)
        val coltype: String = col.getString("coltype")
        if ("enum".equals(coltype)) {
          // pick one of the comma-separated enum values at random
          val enumRules = col.getString("enumRule").split(",")
          array.add(enumRules(getIntRandom(0, enumRules.length)))
        } else if ("num".equals(coltype)) {
          // random value in [min, max) with floatNum decimal places
          val doubleVal = getDoubleRandom(
            col.getDoubleValue("min"),
            col.getDoubleValue("max"),
            col.getInteger("floatNum")
          )
          array.add(doubleVal.toString)
        }
      }
      dataList.append(array)
    }
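    // At this point dataList holds one java.util.ArrayList[String] per row,
    // in the column order defined by colConfig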
    // Convert the collected rows to an RDD
    val rdd: RDD[Any] = spark.sparkContext.parallelize(dataList)
    // ArrayList.toString renders each row as "[v1, v2, v3]"; strip the brackets,
    // split on the commas, and trim the leading spaces the ", " separator leaves behind
    val value: RDD[Array[String]] = rdd.map { x =>
      x.toString.replace("]", "").replace("[", "").split(",").map(_.trim)
    }
    // Build Row objects and apply the schema to get a DataFrame
    val row: RDD[Row] = value.map(r => RowFactory.create(r: _*))
    val frame: DataFrame = spark.createDataFrame(row, schema)
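    // frame.show(5) // optional: inspect a few generated rows before writing them out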
    // Register a temp view and materialize it as a Hive table
    frame.createOrReplaceTempView("temp")
    spark.sql("drop table if exists datasource.test")
    spark.sql("create table datasource.test as select * from temp")
    // make empty strings come back as NULL when the table is read
    spark.sql("alter table datasource.test set serdeproperties('serialization.null.format'='')")
  }
  // Random Int drawn uniformly from [startInt, endInt)
  def getIntRandom(startInt: Integer, endInt: Integer): Int =
    new Random().nextInt(endInt - startInt) + startInt

  // Random Double in [startInt, endInt) with decimalLength decimal places:
  // scale the bounds up by 10^decimalLength, draw an Int, then scale back down
  def getDoubleRandom(startInt: Double, endInt: Double, decimalLength: Integer): Double = {
    val newStartInt = startInt * Math.pow(10, decimalLength.doubleValue())
    val newEndInt = endInt * Math.pow(10, decimalLength.doubleValue())
    getIntRandom(newStartInt.intValue(), newEndInt.intValue()) / Math.pow(10, decimalLength.doubleValue())
  }
}
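As a quick sanity check, the two random helpers can be exercised on their own before wiring them into Spark. Below is a minimal sketch (the RandomHelpersDemo object and the sample outputs in the comments are illustrative, not part of the original code):

object RandomHelpersDemo {
  def main(args: Array[String]): Unit = {
    // getIntRandom(start, end) draws an Int uniformly from [start, end)
    println(Test.getIntRandom(0, 5))            // prints something like 3
    // getDoubleRandom(min, max, n) draws a Double from [min, max) with n decimal places
    println(Test.getDoubleRandom(1.0, 10.0, 2)) // prints something like 4.37
  }
}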