Scenario: in big data projects the data volume is so large that verification code cannot realistically be run against the real project data. To verify that the code works correctly, we can instead write verification code, i.e. unit tests, against fabricated data or simple sample data.
1. Import the package
import org.junit.Test
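If the project is built with sbt (an assumption; the build tool is not specified here), the JUnit dependency and a JUnit runner for sbt can be declared roughly like this:
// build.sbt (sketch; the versions are illustrative)
libraryDependencies += "junit" % "junit" % "4.13.2" % Test
libraryDependencies += "com.novocode" % "junit-interface" % "0.11" % Test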
2. Annotate each test method with @Test
3. assert statements
assert(condition) or
assert(condition, stringMessage)
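Putting steps 2 and 3 together, a minimal test might look like the sketch below; the class and method names are made up for illustration and are not part of the project. Scala's built-in assert throws an AssertionError when the condition is false, which JUnit reports as a test failure.
import org.junit.Test

class AssertDemoTest {
  @Test
  def testSum(): Unit = {
    val sum = 1 + 1
    assert(sum == 2)                           // fails with a plain assertion error
    assert(sum == 2, s"unexpected sum: $sum")  // fails with the given message
  }
}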
4. Example code
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import org.junit.Test

class InvalidHandlerTest extends TestBase {
  /**
   * Test scenario: filter out invalid records based on the target column.
   * Expected result: the data is filtered correctly and the test case passes.
   */
  @Test
  def testFilterLabel(): Unit = {
    val schema = StructType(Array(
      StructField("id", StringType, true),
      StructField("age", DoubleType, true),
      StructField("gender", StringType, true),
      StructField("label", StringType, true)
    ))
    val rdd = spark.sparkContext.parallelize(Seq(
      Row.fromSeq(Seq("1", 20.0, "F", "c1")),
      Row.fromSeq(Seq("2", Double.NaN, "F", "c1")),
      Row.fromSeq(Seq("3", 19.0, "M", "null")),
      Row.fromSeq(Seq("4", 21.0, null, null))
    ))
    val df = spark.createDataFrame(rdd, schema)
    val filteredDf = InvalidHandler.filterData(df, "label")
    // count() only counts rows and does not check whether the values are valid;
    // even a row whose columns are all null still counts as one row
    assert(filteredDf.count() == 2)
  }

  /**
   * Test scenario: filling of invalid values works across the whole DataFrame.
   * Expected result: invalid values are filled correctly and the test case passes.
   */
  @Test
  def testFillNa(): Unit = {
    val schema = StructType(Array(
      StructField("id", StringType, true),
      StructField("age", DoubleType, true),
      StructField("gender", StringType, true),
      StructField("label", StringType, true)
    ))
    val rdd = spark.sparkContext.parallelize(Seq(
      Row.fromSeq(Seq("1", null, "nan", "c1")),
      Row.fromSeq(Seq("2", Double.NaN, "F", "c1")),
      Row.fromSeq(Seq("3", 19.0, "M", "null")),
      Row.fromSeq(Seq("4", 21.0, null, null)),
      Row.fromSeq(Seq("5", 20.0, null, "c2"))
    ))
    val df = spark.createDataFrame(rdd, schema)
    val fillFunc = new InvalidHandler(Map.empty)
    val (filledData, filledMap) = fillFunc.fit(df, "label", "id")
    // convert the DataFrame into an Array[String], one comma-separated string per row
    val fData = filledData.sort("id").rdd.map(t => t.toSeq.mkString(",")).collect()
    // convert the Map into a single String of "key@value" pairs
    val fMap = filledMap.toArray.map(t => t._1.concat("@").concat(t._2.toString)).mkString(",")
    val filledDataBase = Array("1,20.0,unknown,c1", "2,20.0,F,c1", "5,20.0,unknown,c2")
    val filledMapBase = "age@".concat("20.0,").concat("gender@unknown")
    // use sameElements to compare two arrays element by element
    assert(fData.sameElements(filledDataBase) && fMap == filledMapBase)
  }
}
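The example above extends TestBase, which is assumed to expose a spark SparkSession to the test methods; the actual base class is not shown in this section. A minimal sketch under that assumption:
import org.apache.spark.sql.SparkSession

// Hypothetical base class; the project's real TestBase may differ.
class TestBase {
  // local SparkSession so the tests can run without a cluster
  lazy val spark: SparkSession = SparkSession.builder()
    .master("local[2]")
    .appName("unit-test")
    .getOrCreate()
}
In a real project the base class would usually also stop the session once all tests have finished; that detail is omitted here.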