Scenario: in big data projects the data volume is so large that verification code cannot realistically be run against the real project data. To verify that the code works correctly, we can instead write verification code, i.e. unit tests, against fabricated data or simple sample data.
1. Import the package
import org.junit.Test
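If the project is built with sbt (an assumption; the build tool is not specified here), the JUnit dependency and a JUnit runner for sbt can be declared roughly like this:
// build.sbt (sketch; the versions are illustrative)
libraryDependencies += "junit" % "junit" % "4.13.2" % Test
libraryDependencies += "com.novocode" % "junit-interface" % "0.11" % Test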
2. Annotate each test method with @Test
3. assert statements
assert(condition) or
assert(condition, stringMessage)
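Putting steps 2 and 3 together, a minimal test might look like the sketch below; the class and method names are made up for illustration and are not part of the project. Scala's built-in assert throws an AssertionError when the condition is false, which JUnit reports as a test failure.
import org.junit.Test

class AssertDemoTest {
  @Test
  def testSum(): Unit = {
    val sum = 1 + 1
    assert(sum == 2)                           // fails with a plain assertion error
    assert(sum == 2, s"unexpected sum: $sum")  // fails with the given message
  }
}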
4. Example code
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import org.junit.Test

class InvalidHandlerTest extends TestBase {
  /**
   * Test scenario: filter out invalid records based on the target column.
   * Expected result: the data is filtered correctly and the test case passes.
   */
  @Test
  def testFilterLabel(): Unit = {
    val schema = StructType(Array(
      StructField("id", StringType, true),
      StructField("age", DoubleType, true),
      StructField("gender", StringType, true),
      StructField("label", StringType, true)
    ))
    val rdd = spark.sparkContext.parallelize(Seq(
      Row.fromSeq(Seq("1", 20.0, "F", "c1")),
      Row.fromSeq(Seq("2", Double.NaN, "F", "c1")),
      Row.fromSeq(Seq("3", 19.0, "M", "null")),
      Row.fromSeq(Seq("4", 21.0, null, null))
    ))
    val df = spark.createDataFrame(rdd, schema)
    val filteredDf = InvalidHandler.filterData(df, "label")
    // count() only counts rows and does not check whether the values are valid;
    // even a row whose columns are all null still counts as one row
    assert(filteredDf.count() == 2)
  }

  /**
   * Test scenario: filling of invalid values works across the whole DataFrame.
   * Expected result: invalid values are filled correctly and the test case passes.
   */
  @Test
  def testFillNa(): Unit = {
    val schema = StructType(Array(
      StructField("id", StringType, true),
      StructField("age", DoubleType, true),
      StructField("gender", StringType, true),
      StructField("label", StringType, true)
    ))
    val rdd = spark.sparkContext.parallelize(Seq(
      Row.fromSeq(Seq("1", null, "nan", "c1")),
      Row.fromSeq(Seq("2", Double.NaN, "F", "c1")),
      Row.fromSeq(Seq("3", 19.0, "M", "null")),
      Row.fromSeq(Seq("4", 21.0, null, null)),
      Row.fromSeq(Seq("5", 20.0, null, "c2"))
    ))
    val df = spark.createDataFrame(rdd, schema)
    val fillFunc = new InvalidHandler(Map.empty)
    val (filledData, filledMap) = fillFunc.fit(df, "label", "id")
    // convert the DataFrame into an Array[String], one comma-separated string per row
    val fData = filledData.sort("id").rdd.map(t => t.toSeq.mkString(",")).collect()
    // convert the Map into a single String of "key@value" pairs
    val fMap = filledMap.toArray.map(t => t._1.concat("@").concat(t._2.toString)).mkString(",")
    val filledDataBase = Array("1,20.0,unknown,c1", "2,20.0,F,c1", "5,20.0,unknown,c2")
    val filledMapBase = "age@".concat("20.0,").concat("gender@unknown")
    // use sameElements to compare two arrays element by element
    assert(fData.sameElements(filledDataBase) && fMap == filledMapBase)
  }
}
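The example above extends TestBase, which is assumed to expose a spark SparkSession to the test methods; the actual base class is not shown in this section. A minimal sketch under that assumption:
import org.apache.spark.sql.SparkSession

// Hypothetical base class; the project's real TestBase may differ.
class TestBase {
  // local SparkSession so the tests can run without a cluster
  lazy val spark: SparkSession = SparkSession.builder()
    .master("local[2]")
    .appName("unit-test")
    .getOrCreate()
}
In a real project the base class would usually also stop the session once all tests have finished; that detail is omitted here.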