From dac54eecb7e9ed5b076ecd84f86d725a9690a2b5 Mon Sep 17 00:00:00 2001 From: Will Girten Date: Mon, 26 Jul 2021 11:03:43 -0400 Subject: [PATCH 01/10] Update Validator tests with API changes. --- .../labs/validation/ValidatorTestSuite.scala | 635 +++++++++++------- 1 file changed, 394 insertions(+), 241 deletions(-) diff --git a/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala b/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala index 6003887..fe54a77 100644 --- a/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala +++ b/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala @@ -3,278 +3,431 @@ package com.databricks.labs.validation import com.databricks.labs.validation.utils.Structures.{Bounds, MinMaxRuleDef} import org.apache.spark.sql.functions.{col, min} import org.scalatest.funsuite.AnyFunSuite -import org.apache.spark.sql.functions._ -case class ValidationValue(validDateTime: java.lang.Long, validNumerics: Array[Double], bounds: Array[Double], validStrings: Array[String]) +case class ValidationValue(ruleName: String, passed: Boolean, permitted: String, actual: String) class ValidatorTestSuite extends AnyFunSuite with SparkSessionFixture { import spark.implicits._ + spark.sparkContext.setLogLevel("ERROR") -// -// test("The input dataframe should have no rule failures on MinMaxRule") { -// val expectedDF = Seq( -// ("MinMax_Cost_Generated_max","bounds",ValidationValue(null,null,Array(0.0, 12.0),null),0,false), -// ("MinMax_Cost_Generated_min","bounds",ValidationValue(null,null,Array(0.0, 12.0),null),0,false), -// ("MinMax_Cost_manual_max","bounds",ValidationValue(null,null,Array(0.0, 12.0),null),0,false), -// ("MinMax_Cost_manual_min","bounds",ValidationValue(null,null,Array(0.0, 12.0),null),0,false), -// ("MinMax_Cost_max","bounds",ValidationValue(null,null,Array(0.0, 12.0),null),0,false), -// ("MinMax_Cost_min","bounds",ValidationValue(null,null,Array(0.0, 12.0),null),0,false), -// ("MinMax_Scan_Price_max","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// ("MinMax_Scan_Price_min","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// ("MinMax_Sku_Price_max","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// ("MinMax_Sku_Price_min","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false) -// ).toDF("Rule_Name","Rule_Type","Validation_Values","Invalid_Count","Failed") -// val data = Seq() -// // 2 per rule so 2 MinMax_Sku_Price + 2 MinMax_Scan_Price + 2 MinMax_Cost + 2 MinMax_Cost_Generated -// // + 2 MinMax_Cost_manual = 10 rules -// val testDF = Seq( -// (1, 2, 3), -// (4, 5, 6), -// (7, 8, 9) -// ).toDF("retail_price", "scan_price", "cost") -// val minMaxPriceDefs = Array( -// MinMaxRuleDef("MinMax_Sku_Price", col("retail_price"), Bounds(0.0, 29.99)), -// MinMaxRuleDef("MinMax_Scan_Price", col("scan_price"), Bounds(0.0, 29.99)), -// MinMaxRuleDef("MinMax_Cost", col("cost"), Bounds(0.0, 12.0)) -// ) -// -// // Generate the array of Rules from the minmax generator -// val rulesArray = RuleSet.generateMinMaxRules(MinMaxRuleDef("MinMax_Cost_Generated", col("cost"), Bounds(0.0, 12.0))) -// -// val someRuleSet = RuleSet(testDF) -// someRuleSet.addMinMaxRules(minMaxPriceDefs: _*) -// someRuleSet.addMinMaxRules("MinMax_Cost_manual", col("cost"), Bounds(0.0,12.0)) -// someRuleSet.add(rulesArray) -// val (rulesReport, passed) = someRuleSet.validate() -// assert(rulesReport.except(expectedDF).count() == 0) -// assert(passed) -// 
assert(rulesReport.count() == 10) -// } -// -// test("The input rule should have 1 invalid count for MinMax_Scan_Price_Minus_Retail_Price_min and max for failing complex type.") { -// val expectedDF = Seq( -// ("MinMax_Retail_Price_Minus_Scan_Price_max","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),1,true), -// ("MinMax_Retail_Price_Minus_Scan_Price_min","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),1,true), -// ("MinMax_Scan_Price_Minus_Retail_Price_max","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// ("MinMax_Scan_Price_Minus_Retail_Price_min","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false) -// ).toDF("Rule_Name","Rule_Type","Validation_Values","Invalid_Count","Failed") -// -// val testDF = Seq( -// (1, 2, 3), -// (4, 5, 6), -// (7, 8, 9) -// ).toDF("retail_price", "scan_price", "cost") -// val minMaxPriceDefs = Array( -// MinMaxRuleDef("MinMax_Retail_Price_Minus_Scan_Price", col("retail_price")-col("scan_price"), Bounds(0.0, 29.99)), -// MinMaxRuleDef("MinMax_Scan_Price_Minus_Retail_Price", col("scan_price")-col("retail_price"), Bounds(0.0, 29.99)) -// ) -// -// // Generate the array of Rules from the minmax generator -// val someRuleSet = RuleSet(testDF) -// someRuleSet.addMinMaxRules(minMaxPriceDefs: _*) -// val (rulesReport, passed) = someRuleSet.validate() -// assert(rulesReport.except(expectedDF).count() == 0, "Expected df is not equal to the returned rules report.") -// assert(!passed) -// assert(rulesReport.count() == 4) -// } -// -// test("The input rule should have 3 invalid count for failing aggregate type.") { -// val expectedDF = Seq( -// ("MinMax_Min_Retail_Price","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// ("MinMax_Min_Scan_Price","bounds",ValidationValue(null,null,Array(3.0, 29.99),null),1,true) -// ).toDF("Rule_Name","Rule_Type","Validation_Values","Invalid_Count","Failed") -// val testDF = Seq( -// (1, 2, 3), -// (4, 5, 6), -// (7, 8, 9) -// ).toDF("retail_price", "scan_price", "cost") -// val minMaxPriceDefs = Seq( -// Rule("MinMax_Min_Retail_Price", min("retail_price"), Bounds(0.0, 29.99)), -// Rule("MinMax_Min_Scan_Price", min("scan_price"), Bounds(3.0, 29.99)) -// ) -// -// -// // Generate the array of Rules from the minmax generator -// val someRuleSet = RuleSet(testDF) -// someRuleSet.add(minMaxPriceDefs) -// val (rulesReport, passed) = someRuleSet.validate() -// assert(rulesReport.except(expectedDF).count() == 0, "Expected df is not equal to the returned rules report.") -// assert(!passed) -// assert(rulesReport.count() == 2) -// } -// -// test("The input dataframe should have exactly 1 rule failure on MinMaxRule") { -// val expectedDF = Seq( -// ("MinMax_Cost_max","bounds",ValidationValue(null,null,Array(0.0, 12.00),null),1,true), -// ("MinMax_Cost_min","bounds",ValidationValue(null,null,Array(0.0, 12.00),null),0,false), -// ("MinMax_Scan_Price_max","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// ("MinMax_Scan_Price_min","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// ("MinMax_Sku_Price_max","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// ("MinMax_Sku_Price_min","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false) -// ).toDF("Rule_Name","Rule_Type","Validation_Values","Invalid_Count","Failed") -// val testDF = Seq( -// (1, 2, 3), -// (4, 5, 6), -// (7, 8, 99) -// ).toDF("retail_price", "scan_price", "cost") -// val minMaxPriceDefs = Array( -// 
MinMaxRuleDef("MinMax_Sku_Price", col("retail_price"), Bounds(0.0, 29.99)), -// MinMaxRuleDef("MinMax_Scan_Price", col("scan_price"), Bounds(0.0, 29.99)), -// MinMaxRuleDef("MinMax_Cost", col("cost"), Bounds(0.0, 12.0)) -// ) -// // Generate the array of Rules from the minmax generator -// -// val someRuleSet = RuleSet(testDF) -// someRuleSet.addMinMaxRules(minMaxPriceDefs: _*) -// val (rulesReport, passed) = someRuleSet.validate() -// val failedResults = rulesReport.filter(rulesReport("Invalid_Count") > 0).collect() -// assert(failedResults.length == 1) -// assert(rulesReport.except(expectedDF).count() == 0, "Expected df is not equal to the returned rules report.") -// assert(failedResults(0)(0) == "MinMax_Cost_max") -// assert(!passed) -// } -// -// test("The DF in the rulesset object is the same as the input test df") { -// val testDF = Seq( -// (1, 2, 3), -// (4, 5, 6), -// (7, 8, 99) -// ).toDF("retail_price", "scan_price", "cost") -// val minMaxPriceDefs = Array( -// MinMaxRuleDef("MinMax_Sku_Price", col("retail_price"), Bounds(0.0, 29.99)), -// MinMaxRuleDef("MinMax_Scan_Price", col("scan_price"), Bounds(0.0, 29.99)), -// MinMaxRuleDef("MinMax_Cost", col("cost"), Bounds(0.0, 12.0)) -// ) -// // Generate the array of Rules from the minmax generator -// -// val someRuleSet = RuleSet(testDF) -// someRuleSet.addMinMaxRules(minMaxPriceDefs: _*) -// val rulesDf = someRuleSet.getDf -// assert(testDF.except(rulesDf).count() == 0) -// } -// -// test("The group by columns are the correct group by clauses in the validation") { -// val expectedDF = Seq( -// (3,"MinMax_Scan_Price_max","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// (6,"MinMax_Scan_Price_max","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// (3,"MinMax_Scan_Price_min","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// (6,"MinMax_Scan_Price_min","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// (3,"MinMax_Sku_Price_max","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// (6,"MinMax_Sku_Price_max","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// (3,"MinMax_Sku_Price_min","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// (6,"MinMax_Sku_Price_min","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false) -// ).toDF("cost","Rule_Name","Rule_Type","Validation_Values","Invalid_Count","Failed") -// // 2 groups so count of the rules should yield (2 minmax rules * 2 columns) * 2 groups in cost (8 rows) -// val testDF = Seq( -// (1, 2, 3), -// (4, 5, 6), -// (7, 8, 3) -// ).toDF("retail_price", "scan_price", "cost") -// val minMaxPriceDefs = Array( -// MinMaxRuleDef("MinMax_Sku_Price", col("retail_price"), Bounds(0.0, 29.99)), -// MinMaxRuleDef("MinMax_Scan_Price", col("scan_price"), Bounds(0.0, 29.99)) -// ) -// -// val someRuleSet = RuleSet(testDF, "cost") -// someRuleSet.addMinMaxRules(minMaxPriceDefs: _*) -// val groupBys = someRuleSet.getGroupBys -// val (groupByValidated, passed) = someRuleSet.validate() -// -// assert(groupBys.length == 1) -// assert(groupBys.head == "cost") -// assert(someRuleSet.isGrouped) -// assert(passed) -// assert(groupByValidated.count() == 8) -// assert(groupByValidated.except(expectedDF).count() == 0, "Expected df is not equal to the returned rules report.") -// assert(groupByValidated.filter(groupByValidated("Invalid_Count") > 0).count() == 0) -// assert(groupByValidated.filter(groupByValidated("Failed") === true).count() == 0) -// } -// 
-// test("The group by columns are with rules failing the validation") { -// val expectedDF = Seq( -// (3,"MinMax_Sku_Price_max","bounds",ValidationValue(null,null,Array(0.0, 0.0),null),1,true), -// (6,"MinMax_Sku_Price_max","bounds",ValidationValue(null,null,Array(0.0, 0.0),null),1,true), -// (3,"MinMax_Sku_Price_min","bounds",ValidationValue(null,null,Array(0.0, 0.0),null),1,true), -// (6,"MinMax_Sku_Price_min","bounds",ValidationValue(null,null,Array(0.0, 0.0),null),1,true), -// (3,"MinMax_Scan_Price_max","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// (6,"MinMax_Scan_Price_max","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// (3,"MinMax_Scan_Price_min","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false), -// (6,"MinMax_Scan_Price_min","bounds",ValidationValue(null,null,Array(0.0, 29.99),null),0,false) -// ).toDF("cost","Rule_Name","Rule_Type","Validation_Values","Invalid_Count","Failed") -// // 2 groups so count of the rules should yield (2 minmax rules * 2 columns) * 2 groups in cost (8 rows) -// val testDF = Seq( -// (1, 2, 3), -// (4, 5, 6), -// (7, 8, 3) -// ).toDF("retail_price", "scan_price", "cost") -// val minMaxPriceDefs = Array( -// MinMaxRuleDef("MinMax_Sku_Price", col("retail_price"), Bounds(0.0, 0.0)), -// MinMaxRuleDef("MinMax_Scan_Price", col("scan_price"), Bounds(0.0, 29.99)) -// ) -// -// val someRuleSet = RuleSet(testDF, "cost") -// someRuleSet.addMinMaxRules(minMaxPriceDefs: _*) -// val groupBys = someRuleSet.getGroupBys -// val (groupByValidated, passed) = someRuleSet.validate() -// -// assert(groupBys.length == 1, "Group by length is not 1") -// assert(groupBys.head == "cost", "Group by column is not cost") -// assert(someRuleSet.isGrouped) -// assert(!passed, "Rule set did not fail.") -// assert(groupByValidated.count() == 8, "Rule count should be 8") -// assert(groupByValidated.except(expectedDF).count() == 0, "Expected df is not equal to the returned rules report.") -// assert(groupByValidated.filter(groupByValidated("Invalid_Count") > 0).count() == 4, "Invalid count is not 4.") -// assert(groupByValidated.filter(groupByValidated("Failed") === true).count() == 4, "Failed count is not 4.") -// } -// - test("Validate list of values with numeric types, string types and long types.") { + test("The input dataframe should have no rule failures on MinMaxRule") { + // 2 per rule so 2 MinMax_Sku_Price + 2 MinMax_Scan_Price + 2 MinMax_Cost + 2 MinMax_Cost_Generated + // + 2 MinMax_Cost_manual = 10 rules + val testDF = Seq( + (1, 2, 3), + (4, 5, 6), + (7, 8, 9) + ).toDF("retail_price", "scan_price", "cost") + + val expectedColumns = testDF.columns ++ Seq("MinMax_Sku_Price_min", "MinMax_Sku_Price_max", "MinMax_Scan_Price_min", + "MinMax_Scan_Price_max", "MinMax_Cost_min", "MinMax_Cost_max", "MinMax_Cost_manual_min", "MinMax_Cost_manual_max", + "MinMax_Cost_Generated_min", "MinMax_Cost_Generated_max") + val expectedDF = Seq( + (1, 2, 3, + ValidationValue("MinMax_Sku_Price_min", passed = true, "[0.0, 29.99]", "1"), + ValidationValue("MinMax_Sku_Price_max", passed = true, "[0.0, 29.99]", "1"), + ValidationValue("MinMax_Scan_Price_min", passed = true, "[0.0, 29.99]", "2"), + ValidationValue("MinMax_Scan_Price_max", passed = true, "[0.0, 29.99]", "2"), + ValidationValue("MinMax_Cost_min", passed = true, "[0.0, 12.0]", "3"), + ValidationValue("MinMax_Cost_max", passed = true, "[0.0, 12.0]", "3"), + ValidationValue("MinMax_Cost_manual_min", passed = true, "[0.0, 12.0]", "3"), + 
ValidationValue("MinMax_Cost_manual_max", passed = true, "[0.0, 12.0]", "3"), + ValidationValue("MinMax_Cost_Generated_min", passed = true, "[0.0, 12.0]", "3"), + ValidationValue("MinMax_Cost_Generated_max", passed = true, "[0.0, 12.0]", "3") + ), + (4, 5, 6, + ValidationValue("MinMax_Sku_Price_min", passed = true, "[0.0, 29.99]", "4"), + ValidationValue("MinMax_Sku_Price_max", passed = true, "[0.0, 29.99]", "4"), + ValidationValue("MinMax_Scan_Price_min", passed = true, "[0.0, 29.99]", "5"), + ValidationValue("MinMax_Scan_Price_max", passed = true, "[0.0, 29.99]", "5"), + ValidationValue("MinMax_Cost_min", passed = true, "[0.0, 12.0]", "6"), + ValidationValue("MinMax_Cost_max", passed = true, "[0.0, 12.0]", "6"), + ValidationValue("MinMax_Cost_manual_min", passed = true, "[0.0, 12.0]", "6"), + ValidationValue("MinMax_Cost_manual_max", passed = true, "[0.0, 12.0]", "6"), + ValidationValue("MinMax_Cost_Generated_min", passed = true, "[0.0, 12.0]", "6"), + ValidationValue("MinMax_Cost_Generated_max", passed = true, "[0.0, 12.0]", "6") + ), + (7, 8, 9, + ValidationValue("MinMax_Sku_Price_min", passed = true, "[0.0, 29.99]", "7"), + ValidationValue("MinMax_Sku_Price_max", passed = true, "[0.0, 29.99]", "7"), + ValidationValue("MinMax_Scan_Price_min", passed = true, "[0.0, 29.99]", "8"), + ValidationValue("MinMax_Scan_Price_max", passed = true, "[0.0, 29.99]", "8"), + ValidationValue("MinMax_Cost_min", passed = true, "[0.0, 12.0]", "9"), + ValidationValue("MinMax_Cost_max", passed = true, "[0.0, 12.0]", "9"), + ValidationValue("MinMax_Cost_manual_min", passed = true, "[0.0, 12.0]", "9"), + ValidationValue("MinMax_Cost_manual_max", passed = true, "[0.0, 12.0]", "9"), + ValidationValue("MinMax_Cost_Generated_min", passed = true, "[0.0, 12.0]", "9"), + ValidationValue("MinMax_Cost_Generated_max", passed = true, "[0.0, 12.0]", "9") + ) + ).toDF(expectedColumns: _*) + + // Create an Array of MinMax Rules + val minMaxPriceDefs = Array( + MinMaxRuleDef("MinMax_Sku_Price", col("retail_price"), Bounds(0.0, 29.99)), + MinMaxRuleDef("MinMax_Scan_Price", col("scan_price"), Bounds(0.0, 29.99)), + MinMaxRuleDef("MinMax_Cost", col("cost"), Bounds(0.0, 12.0)) + ) + + // Generate the array of Rules from the minmax generator + val rulesArray = RuleSet.generateMinMaxRules(MinMaxRuleDef("MinMax_Cost_Generated", col("cost"), Bounds(0.0, 12.0))) + val someRuleSet = RuleSet(testDF) + someRuleSet.addMinMaxRules(minMaxPriceDefs: _*) + + // Manually add a Rule + someRuleSet.addMinMaxRules("MinMax_Cost_manual", col("cost"), Bounds(0.0, 12.0)) + someRuleSet.add(rulesArray) + val validationResults = someRuleSet.validate() + + // Ensure that validate report is expected + assert(validationResults.completeReport.exceptAll(expectedDF).count() == 0, "Expected df is not equal to the returned rules report.") + + // Ensure that there are 2 Rules per MinMax Rule added as separate columns + assert(validationResults.completeReport.count() == 3) + assert((validationResults.completeReport.columns diff testDF.columns).length == 10) + + // Ensure that all Rules passed;there should be no failed Rules + assert(validationResults.summaryReport.count() == 0) + } + + test("The input rule should have 3 invalid count for MinMax_Scan_Price_Minus_Retail_Price_min and max for failing complex type.") { + val testDF = Seq( + (1, 2, 3), + (4, 5, 6), + (7, 8, 9) + ).toDF("retail_price", "scan_price", "cost") + val expectedColumns = testDF.columns ++ Seq("MinMax_Retail_Price_Minus_Scan_Price_min", "MinMax_Retail_Price_Minus_Scan_Price_max", + 
"MinMax_Scan_Price_Minus_Retail_Price_min", "MinMax_Scan_Price_Minus_Retail_Price_max") + val expectedDF = Seq( + (1, 2, 3, + ValidationValue("MinMax_Retail_Price_Minus_Scan_Price_min", passed = false, "[0.0, 29.99]", "-1"), + ValidationValue("MinMax_Retail_Price_Minus_Scan_Price_max", passed = false, "[0.0, 29.99]", "-1"), + ValidationValue("MinMax_Scan_Price_Minus_Retail_Price_min", passed = true, "[0.0, 29.99]", "1"), + ValidationValue("MinMax_Scan_Price_Minus_Retail_Price_max", passed = true, "[0.0, 29.99]", "1") + ), + (4, 5, 6, + ValidationValue("MinMax_Retail_Price_Minus_Scan_Price_min", passed = false, "[0.0, 29.99]", "-1"), + ValidationValue("MinMax_Retail_Price_Minus_Scan_Price_max", passed = false, "[0.0, 29.99]", "-1"), + ValidationValue("MinMax_Scan_Price_Minus_Retail_Price_min", passed = true, "[0.0, 29.99]", "1"), + ValidationValue("MinMax_Scan_Price_Minus_Retail_Price_max", passed = true, "[0.0, 29.99]", "1") + ), + (7, 8, 9, + ValidationValue("MinMax_Retail_Price_Minus_Scan_Price_min", passed = false, "[0.0, 29.99]", "-1"), + ValidationValue("MinMax_Retail_Price_Minus_Scan_Price_max", passed = false, "[0.0, 29.99]", "-1"), + ValidationValue("MinMax_Scan_Price_Minus_Retail_Price_min", passed = true, "[0.0, 29.99]", "1"), + ValidationValue("MinMax_Scan_Price_Minus_Retail_Price_max", passed = true, "[0.0, 29.99]", "1") + ) + ).toDF(expectedColumns: _*) + + val minMaxPriceDefs = Array( + MinMaxRuleDef("MinMax_Retail_Price_Minus_Scan_Price", col("retail_price") - col("scan_price"), Bounds(0.0, 29.99)), + MinMaxRuleDef("MinMax_Scan_Price_Minus_Retail_Price", col("scan_price") - col("retail_price"), Bounds(0.0, 29.99)) + ) + + // Generate the array of Rules from the minmax generator + val someRuleSet = RuleSet(testDF) + someRuleSet.addMinMaxRules(minMaxPriceDefs: _*) + val validationResults = someRuleSet.validate() + + // Ensure that validate report is expected + assert(validationResults.completeReport.exceptAll(expectedDF).count() == 0, "Expected df is not equal to the returned rules report.") + + // Ensure that there are failed rows in summary report + assert(validationResults.summaryReport.count() > 0) + assert(validationResults.summaryReport.count() == 3) + } + + test("The input rule should have 1 invalid count for failing aggregate type.") { + val testDF = Seq( + (1, 2, 3), + (4, 5, 6), + (7, 8, 9) + ).toDF("retail_price", "scan_price", "cost") + val expectedColumns = testDF.columns ++ Seq("MinMax_Min_Retail_Price", "MinMax_Min_Scan_Price") + val expectedDF = Seq( + (1, 2, 3, + ValidationValue("MinMax_Min_Retail_Price", passed = true, "[0.0, 29.99]", "1"), + ValidationValue("MinMax_Min_Scan_Price", passed = false, "[3.0, 29.99]", "2") + ), + (4, 5, 6, + ValidationValue("MinMax_Min_Retail_Price", passed = true, "[0.0, 29.99]", "4"), + ValidationValue("MinMax_Min_Scan_Price", passed = true, "[3.0, 29.99]", "5") + ), + (7, 8, 9, + ValidationValue("MinMax_Min_Retail_Price", passed = true, "[0.0, 29.99]", "7"), + ValidationValue("MinMax_Min_Scan_Price", passed = true, "[3.0, 29.99]", "8") + ) + ).toDF(expectedColumns: _*) + val minMaxPriceDefs = Seq( + Rule("MinMax_Min_Retail_Price", min("retail_price"), Bounds(0.0, 29.99)), + Rule("MinMax_Min_Scan_Price", min("scan_price"), Bounds(3.0, 29.99)) + ) + + // Generate the array of Rules from the minmax generator + val someRuleSet = RuleSet(testDF) + someRuleSet.add(minMaxPriceDefs) + val validationResults = someRuleSet.validate() + + // Ensure that validate report is expected + 
assert(validationResults.completeReport.exceptAll(expectedDF).count() == 0, "Expected df is not equal to the returned rules report.")
+
+    // Ensure that there is a failed row
+    assert(validationResults.summaryReport.count() > 0)
+    assert(validationResults.summaryReport.count() == 1)
+  }
+
+  test("The input dataframe should have exactly 1 rule failure on MinMaxRule") {
+    val testDF = Seq(
+      (1, 2, 3),
+      (4, 5, 6),
+      (7, 8, 99)
+    ).toDF("retail_price", "scan_price", "cost")
+    val expectedColumns = testDF.columns ++ Seq("MinMax_Sku_Price_min", "MinMax_Sku_Price_max",
+      "MinMax_Scan_Price_min", "MinMax_Scan_Price_max", "MinMax_Cost_min", "MinMax_Cost_max"
+    )
+    val expectedDF = Seq(
+      (1, 2, 3,
+        ValidationValue("MinMax_Sku_Price_min", passed = true, "[0.0, 29.99]", "1"),
+        ValidationValue("MinMax_Sku_Price_max", passed = true, "[0.0, 29.99]", "1"),
+        ValidationValue("MinMax_Scan_Price_min", passed = true, "[0.0, 29.99]", "2"),
+        ValidationValue("MinMax_Scan_Price_max", passed = true, "[0.0, 29.99]", "2"),
+        ValidationValue("MinMax_Cost_min", passed = true, "[0.0, 12.0]", "3"),
+        ValidationValue("MinMax_Cost_max", passed = true, "[0.0, 12.0]", "3"),
+      ),
+      (4, 5, 6,
+        ValidationValue("MinMax_Sku_Price_min", passed = true, "[0.0, 29.99]", "4"),
+        ValidationValue("MinMax_Sku_Price_max", passed = true, "[0.0, 29.99]", "4"),
+        ValidationValue("MinMax_Scan_Price_min", passed = true, "[0.0, 29.99]", "5"),
+        ValidationValue("MinMax_Scan_Price_max", passed = true, "[0.0, 29.99]", "5"),
+        ValidationValue("MinMax_Cost_min", passed = true, "[0.0, 12.0]", "6"),
+        ValidationValue("MinMax_Cost_max", passed = true, "[0.0, 12.0]", "6"),
+      ),
+      (7, 8, 99,
+        ValidationValue("MinMax_Sku_Price_min", passed = true, "[0.0, 29.99]", "7"),
+        ValidationValue("MinMax_Sku_Price_max", passed = true, "[0.0, 29.99]", "7"),
+        ValidationValue("MinMax_Scan_Price_min", passed = true, "[0.0, 29.99]", "8"),
+        ValidationValue("MinMax_Scan_Price_max", passed = true, "[0.0, 29.99]", "8"),
+        ValidationValue("MinMax_Cost_min", passed = false, "[0.0, 12.0]", "99"),
+        ValidationValue("MinMax_Cost_max", passed = false, "[0.0, 12.0]", "99"),
+      )
+    ).toDF(expectedColumns: _*)
+
+    val minMaxPriceDefs = Array(
+      MinMaxRuleDef("MinMax_Sku_Price", col("retail_price"), Bounds(0.0, 29.99)),
+      MinMaxRuleDef("MinMax_Scan_Price", col("scan_price"), Bounds(0.0, 29.99)),
+      MinMaxRuleDef("MinMax_Cost", col("cost"), Bounds(0.0, 12.0))
+    )
+
+    // Generate the array of Rules from the minmax generator
+    val someRuleSet = RuleSet(testDF)
+    someRuleSet.addMinMaxRules(minMaxPriceDefs: _*)
+    val validationResults = someRuleSet.validate()
+
+    // Ensure that validate report is expected
+    assert(validationResults.completeReport.exceptAll(expectedDF).count() == 0, "Expected df is not equal to the returned rules report.")
+
+    // Ensure that there is a failed row
+    assert(validationResults.summaryReport.count() > 0)
+    assert(validationResults.summaryReport.count() == 1)
+
+    // Ensure that the failed Rules are MinMax_Cost_min, MinMax_Cost_max
+    assert(validationResults.summaryReport.select("failed_rules.ruleName").as[Array[String]].collect()(0)(0) == "MinMax_Cost_min", "The first failed Rule should be MinMax_Cost_min.")
+    assert(validationResults.summaryReport.select("failed_rules.ruleName").as[Array[String]].collect()(0)(1) == "MinMax_Cost_max", "The second failed Rule should be MinMax_Cost_max.")
+  }
+
+  test("The DF in the RuleSet object is the same as the input test df") {
+    val testDF = Seq(
+      (1, 2, 3),
+      (4, 5, 6),
+      (7, 8, 99)
+    ).toDF("retail_price", "scan_price", "cost")
+    val minMaxPriceDefs = Array(
MinMaxRuleDef("MinMax_Sku_Price", col("retail_price"), Bounds(0.0, 29.99)), + MinMaxRuleDef("MinMax_Scan_Price", col("scan_price"), Bounds(0.0, 29.99)), + MinMaxRuleDef("MinMax_Cost", col("cost"), Bounds(0.0, 12.0)) + ) + // Generate the array of Rules from the minmax generator + val someRuleSet = RuleSet(testDF) + someRuleSet.addMinMaxRules(minMaxPriceDefs: _*) + val rulesDf = someRuleSet.getDf + assert(testDF.except(rulesDf).count() == 0) + } + + test("The group by columns are the correct group by clauses in the validation") { + // 2 groups so count of the rules should yield (2 minmax rules * 2 columns) * 2 groups in cost (8 rows) + val testDF = Seq( + (1, 2, 3), + (4, 5, 6), + (7, 8, 3) + ).toDF("retail_price", "scan_price", "cost") + val expectedColumns = Seq("cost", "MinMax_Sku_Price_min", "MinMax_Sku_Price_max", "MinMax_Scan_Price_min", "MinMax_Scan_Price_max") + val expectedDF = Seq( + (3, + ValidationValue("MinMax_Sku_Price_min", passed = true, "[0.0, 29.99]", "1"), + ValidationValue("MinMax_Sku_Price_max", passed = true, "[0.0, 29.99]", "7"), + ValidationValue("MinMax_Scan_Price_min", passed = true, "[0.0, 29.99]", "2"), + ValidationValue("MinMax_Scan_Price_max", passed = true, "[0.0, 29.99]", "8") + ), + (6, + ValidationValue("MinMax_Sku_Price_min", passed = true, "[0.0, 29.99]", "4"), + ValidationValue("MinMax_Sku_Price_max", passed = true, "[0.0, 29.99]", "4"), + ValidationValue("MinMax_Scan_Price_min", passed = true, "[0.0, 29.99]", "5"), + ValidationValue("MinMax_Scan_Price_max", passed = true, "[0.0, 29.99]", "5") + ) + ).toDF(expectedColumns: _*) + + val minMaxPriceDefs = Array( + MinMaxRuleDef("MinMax_Sku_Price", col("retail_price"), Bounds(0.0, 29.99)), + MinMaxRuleDef("MinMax_Scan_Price", col("scan_price"), Bounds(0.0, 29.99)) + ) + + val someRuleSet = RuleSet(testDF, "cost") + someRuleSet.addMinMaxRules(minMaxPriceDefs: _*) + val groupBys = someRuleSet.getGroupBys + val validationResults = someRuleSet.validate() + + // Ensure that input DF was grouped by "cost" column + assert(groupBys.length == 1) + assert(groupBys.head == "cost") + assert(someRuleSet.isGrouped) + + // Ensure that all rows passed + assert(validationResults.summaryReport.count() == 0) + + // Ensure that the complete report matches the expected output + assert(validationResults.completeReport.count() == 2) + assert(validationResults.completeReport.exceptAll(expectedDF).count() == 0, "Expected df is not equal to the returned rules report.") + } + + test("The group by columns are with rules failing the validation") { + // 2 groups so count of the rules should yield (2 minmax rules * 2 columns) * 2 groups in cost (8 rows) + val testDF = Seq( + (1, 2, 3), + (4, 5, 6), + (7, 8, 3) + ).toDF("retail_price", "scan_price", "cost") + val expectedColumns = Seq("cost", "MinMax_Sku_Price_min", "MinMax_Sku_Price_max", "MinMax_Scan_Price_min", "MinMax_Scan_Price_max") + val expectedDF = Seq( + (3, + ValidationValue("MinMax_Sku_Price_min", passed = false, "[0.0, 0.0]", "1"), + ValidationValue("MinMax_Sku_Price_max", passed = false, "[0.0, 0.0]", "7"), + ValidationValue("MinMax_Scan_Price_min", passed = true, "[0.0, 29.99]", "2"), + ValidationValue("MinMax_Scan_Price_max", passed = true, "[0.0, 29.99]", "8") + ), + (6, + ValidationValue("MinMax_Sku_Price_min", passed = false, "[0.0, 0.0]", "4"), + ValidationValue("MinMax_Sku_Price_max", passed = false, "[0.0, 0.0]", "4"), + ValidationValue("MinMax_Scan_Price_min", passed = true, "[0.0, 29.99]", "5"), + ValidationValue("MinMax_Scan_Price_max", passed = true, "[0.0, 
29.99]", "5") + ) + ).toDF(expectedColumns: _*) + val minMaxPriceDefs = Array( + MinMaxRuleDef("MinMax_Sku_Price", col("retail_price"), Bounds(0.0, 0.0)), + MinMaxRuleDef("MinMax_Scan_Price", col("scan_price"), Bounds(0.0, 29.99)) + ) + + val someRuleSet = RuleSet(testDF, "cost") + someRuleSet.addMinMaxRules(minMaxPriceDefs: _*) + val groupBys = someRuleSet.getGroupBys + val validationResults = someRuleSet.validate() + + assert(groupBys.length == 1, "Group by length is not 1") + assert(groupBys.head == "cost", "Group by column is not cost") + assert(someRuleSet.isGrouped) + + // Ensure that there are failed rows + assert(validationResults.summaryReport.count() > 0, "Rule set did not fail.") + assert(validationResults.summaryReport.count() == 2, "Failed row count should be 2") + assert(validationResults.completeReport.count() == 2, "Row count should be 2") + + // Ensure that the complete report matches expected output + assert(validationResults.completeReport.exceptAll(expectedDF).count() == 0, "Expected df is not equal to the returned rules report.") + } + + test("Validate list of values with numeric types, string types and long types.") { val testDF = Seq( ("food_a", 2.51, 3, 111111111111111L), ("food_b", 5.11, 6, 211111111111111L), ("food_c", 8.22, 99, 311111111111111L) ).toDF("product_name", "scan_price", "cost", "id") + val expectedColumns = testDF.columns ++ Seq("CheckIfCostIsInLOV", "CheckIfScanPriceIsInLOV", "CheckIfIdIsInLOV") val numericLovExpectedDF = Seq( - ("CheckIfCostIsInLOV","validNumerics",ValidationValue(null,Array(3,6,99),null,null),0,false), - ("CheckIfScanPriceIsInLOV","validNumerics",ValidationValue(null,Array(2.51,5.11,8.22),null,null),0,false), - ("CheckIfIdIsInLOV","validNumerics",ValidationValue(null,Array(111111111111111L,211111111111111L,311111111111111L),null,null),0,false) - ).toDF("Rule_Name","Rule_Type","Validation_Values","Invalid_Count","Failed") + ("food_a", 2.51, 3, 111111111111111L, + ValidationValue("CheckIfCostIsInLOV", passed = true, "[3.0, 6.0, 99.0]", "3"), + ValidationValue("CheckIfScanPriceIsInLOV", passed = true, "[2.51, 5.11, 8.22]", "2.51"), + ValidationValue("CheckIfIdIsInLOV", passed = true, "[1.11111111111111E14, 2.11111111111111E14, 3.11111111111111E14]", "111111111111111") + ), + ("food_b", 5.11, 6, 211111111111111L, + ValidationValue("CheckIfCostIsInLOV", passed = true, "[3.0, 6.0, 99.0]", "6"), + ValidationValue("CheckIfScanPriceIsInLOV", passed = true, "[2.51, 5.11, 8.22]", "5.11"), + ValidationValue("CheckIfIdIsInLOV", passed = true, "[1.11111111111111E14, 2.11111111111111E14, 3.11111111111111E14]", "211111111111111") + ), + ("food_c", 8.22, 99, 311111111111111L, + ValidationValue("CheckIfCostIsInLOV", passed = true, "[3.0, 6.0, 99.0]", "99"), + ValidationValue("CheckIfScanPriceIsInLOV", passed = true, "[2.51, 5.11, 8.22]", "8.22"), + ValidationValue("CheckIfIdIsInLOV", passed = true, "[1.11111111111111E14, 2.11111111111111E14, 3.11111111111111E14]", "311111111111111") + ) + ).toDF(expectedColumns: _*) + val numericRules = Array( - Rule("CheckIfCostIsInLOV", col("cost"), Array(3,6,99)), - Rule("CheckIfScanPriceIsInLOV", col("scan_price"), Array(2.51,5.11,8.22)), - Rule("CheckIfIdIsInLOV", col("id"), Array(111111111111111L,211111111111111L,311111111111111L)) + Rule("CheckIfCostIsInLOV", col("cost"), Array(3, 6, 99)), + Rule("CheckIfScanPriceIsInLOV", col("scan_price"), Array(2.51, 5.11, 8.22)), + Rule("CheckIfIdIsInLOV", col("id"), Array(111111111111111L, 211111111111111L, 311111111111111L)) ) - // Generate the array of Rules from the 
minmax generator + // Generate the array of Rules from the minmax generator val numericRuleSet = RuleSet(testDF) numericRuleSet.add(numericRules) val numericValidationResults = numericRuleSet.validate() + + // Ensure that all ruleTypes are ValidateNumerics assert(numericRules.map(_.ruleType == RuleType.ValidateNumerics).reduce(_ && _), "Not every value is validate numerics.") - assert(numericRules.map(_.boundaries == null).reduce(_ && _), "Boundaries are not null.") + + // Ensure that there are infinite boundaries, by default + assert(numericRules.map(_.boundaries.lower == Double.NegativeInfinity).reduce(_ && _), "Lower boundaries are not negatively infinite.") + assert(numericRules.map(_.boundaries.upper == Double.PositiveInfinity).reduce(_ && _), "Upper boundaries are not positively infinite.") + + // Ensure that the complete report matches expected output + assert(numericValidationResults.completeReport.exceptAll(numericLovExpectedDF).count() == 0, "Expected numeric df is not equal to the returned rules report.") + + // Ensure that all rows passed the Rules assert(numericValidationResults.summaryReport.isEmpty) -// assert(numericValidated.except(numericLovExpectedDF).count() == 0, "Expected df is not equal to the returned rules report.") -// assert(numericValidated.filter(numericValidated("Invalid_Count") > 0).count() == 0) -// assert(numericValidated.filter(numericValidated("Failed") === true).count() == 0) - val stringRule = Rule("CheckIfProductNameInLOV", col("product_name"), Array("food_a","food_b","food_c")) - // Generate the array of Rules from the minmax generator + // Create a String List of Values Rule + val stringRule = Rule("CheckIfProductNameInLOV", col("product_name"), Array("food_a", "food_b", "food_c")) + val expectedStringLovColumns = testDF.columns ++ Seq("CheckIfProductNameInLOV") val stringLovExpectedDF = Seq( - ("CheckIfProductNameInLOV","validStrings",ValidationValue(null,null,null,Array("food_a", "food_b", "food_c")),0,false) - ).toDF("Rule_Name","Rule_Type","Validation_Values","Invalid_Count","Failed") + ("food_a", 2.51, 3, 111111111111111L, + ValidationValue("CheckIfProductNameInLOV", passed = true, "[food_a, food_b, food_c]", "food_a") + ), + ("food_b", 5.11, 6, 211111111111111L, + ValidationValue("CheckIfProductNameInLOV", passed = true, "[food_a, food_b, food_c]", "food_b") + ), + ("food_c", 8.22, 99, 311111111111111L, + ValidationValue("CheckIfProductNameInLOV", passed = true, "[food_a, food_b, food_c]", "food_c") + ) + ).toDF(expectedStringLovColumns: _*) + // Validate testDF against String LOV Rule val stringRuleSet = RuleSet(testDF) stringRuleSet.add(stringRule) val stringValidationResults = stringRuleSet.validate() + + // Ensure that the ruleType is set properly assert(stringRule.ruleType == RuleType.ValidateStrings) - assert(stringRule.boundaries == null) + + // Ensure that the complete report matches expected output + assert(stringValidationResults.completeReport.exceptAll(stringLovExpectedDF).count() == 0, "Expected String LOV df is not equal to the returned rules report.") + + // Ensure that there are infinite boundaries, by default + assert(stringRule.boundaries.lower == Double.NegativeInfinity, "Lower boundaries are not negatively infinite.") + assert(stringRule.boundaries.upper == Double.PositiveInfinity, "Upper boundaries are not positively infinite.") + + // Ensure that all rows passed; there are no failed rows assert(stringValidationResults.summaryReport.isEmpty) -// assert(stringValidated.except(stringLovExpectedDF).count() == 0, "Expected df 
is not equal to the returned rules report.") -// assert(stringValidated.filter(stringValidated("Invalid_Count") > 0).count() == 0) -// assert(stringValidated.filter(stringValidated("Failed") === true).count() == 0) } - } From 55c94efd171606b7ee7c0ee3988b94816d2a3056 Mon Sep 17 00:00:00 2001 From: Will Girten Date: Tue, 27 Jul 2021 14:09:36 -0400 Subject: [PATCH 02/10] Add tests for implicit and explicit expression rules. --- .../labs/validation/ValidatorTestSuite.scala | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala b/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala index fe54a77..a3f5b69 100644 --- a/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala +++ b/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala @@ -430,4 +430,76 @@ class ValidatorTestSuite extends AnyFunSuite with SparkSessionFixture { assert(stringValidationResults.summaryReport.isEmpty) } + test("The input df should have no rule failures for an implicit expression rule.") { + + val testDF = Seq( + (1, "iot_thermostat_1", 84.00, 74.00), + (2, "iot_thermostat_2", 67.05, 72.00), + (3, "iot_thermostat_3", 91.14, 76.00) + ).toDF("device_id", "device_name", "current_temp", "target_temp") + + val expectedColumns = testDF.columns ++ Seq("TemperatureDiffExpressionRule") + val expectedDF = Seq( + (1, "iot_thermostat_1", 84.00, 74.00, ValidationValue("TemperatureDiffExpressionRule", passed = true, "(abs((current_temp - target_temp)) < 50.0)", "true")), + (2, "iot_thermostat_2", 67.05, 72.00, ValidationValue("TemperatureDiffExpressionRule", passed = true, "(abs((current_temp - target_temp)) < 50.0)", "true")), + (3, "iot_thermostat_3", 91.14, 76.00, ValidationValue("TemperatureDiffExpressionRule", passed = true, "(abs((current_temp - target_temp)) < 50.0)", "true")) + ).toDF(expectedColumns : _*) + + val exprRuleSet = RuleSet(testDF) + exprRuleSet.add(Rule("TemperatureDiffExpressionRule", abs(col("current_temp") - col("target_temp")) < 50.00)) + + val validationResults = exprRuleSet.validate() + + // Ensure that there are no failed rows for rule expression + assert(validationResults.summaryReport.isEmpty) + + // Ensure that the ruleType is set correctly + assert(exprRuleSet.getRules.head.ruleType == RuleType.ValidateExpr) + assert(exprRuleSet.getRules.head.isImplicitBool) + + // Ensure that the complete report matches the expected output + assert(validationResults.completeReport.exceptAll(expectedDF).count() == 0, "Expected expression df is not equal to the returned rules report.") + + } + + test("The input df should have a single rule failure for an expression rule.") { + + val testDF = Seq( + (1, "iot_thermostat_1", 84.00, 74.00, -10.00, -10.00), + (2, "iot_thermostat_2", 76.00, 66.00, -10.00, -10.00), + (3, "iot_thermostat_3", 91.00, 69.00, -20.00, -10.00) + ).toDF("device_id", "device_name", "current_temp", "target_temp", "temp_diff", "cooling_rate") + + val expectedColumns = testDF.columns ++ Seq("ImplicitCoolingExpressionRule") + val expectedDF = Seq( + (1, "iot_thermostat_1", 84, 74, -10, -10, + ValidationValue("CoolingExpressionRule", passed = true, "abs(cooling_rate)", "10.0") + ), + (2, "iot_thermostat_2", 76, 66, -10, -10, + ValidationValue("CoolingExpressionRule", passed = true, "abs(cooling_rate)", "10.0") + ), + (3, "iot_thermostat_3", 91, 69, -20, -10, + ValidationValue("CoolingExpressionRule", passed = false, "abs(cooling_rate)", "10.0") + ) + ).toDF(expectedColumns : 
_*) + + val exprRuleSet = RuleSet(testDF) + // Create a rule that ensure the cooling rate can accommodate the temp difference + exprRuleSet.add(Rule("CoolingExpressionRule", abs(col("cooling_rate")), expr("abs(temp_diff)"))) + + val validationResults = exprRuleSet.validate() + + // Ensure that there is a single row failure + assert(validationResults.summaryReport.count() > 0) + assert(validationResults.summaryReport.count() == 1) + + // Ensure that the ruleType is set correctly + assert(exprRuleSet.getRules.head.ruleType == RuleType.ValidateExpr) + assert(!exprRuleSet.getRules.head.isImplicitBool) + + // Ensure that the complete report matches the expected output + assert(validationResults.completeReport.exceptAll(expectedDF).count() == 0, "Expected explicit expression df is not equal to the returned rules report.") + + } + } From 5360ef6abeba0cd7e915f3ba989594638dace488 Mon Sep 17 00:00:00 2001 From: "Daniel Tomes [GeekSheikh]" <10840635+geeksheikh@users.noreply.github.com> Date: Tue, 27 Jul 2021 16:48:47 -0400 Subject: [PATCH 03/10] imported outstanding spark sql functions --- .../com/databricks/labs/validation/ValidatorTestSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala b/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala index a3f5b69..9553952 100644 --- a/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala +++ b/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala @@ -1,7 +1,7 @@ package com.databricks.labs.validation import com.databricks.labs.validation.utils.Structures.{Bounds, MinMaxRuleDef} -import org.apache.spark.sql.functions.{col, min} +import org.apache.spark.sql.functions._ import org.scalatest.funsuite.AnyFunSuite case class ValidationValue(ruleName: String, passed: Boolean, permitted: String, actual: String) From a8e08726650e11795f601daef6c2413e16f540ae Mon Sep 17 00:00:00 2001 From: Will Girten Date: Fri, 30 Jul 2021 18:11:32 -0400 Subject: [PATCH 04/10] Add test suite for Rules class. 
---
 .../labs/validation/RuleTestSuite.scala | 113 ++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 src/test/scala/com/databricks/labs/validation/RuleTestSuite.scala

diff --git a/src/test/scala/com/databricks/labs/validation/RuleTestSuite.scala b/src/test/scala/com/databricks/labs/validation/RuleTestSuite.scala
new file mode 100644
index 0000000..82158af
--- /dev/null
+++ b/src/test/scala/com/databricks/labs/validation/RuleTestSuite.scala
@@ -0,0 +1,113 @@
+package com.databricks.labs.validation
+
+import com.databricks.labs.validation.utils.Structures.Bounds
+import org.apache.spark.sql.functions.col
+import org.scalatest.funsuite.AnyFunSuite
+
+
+class RuleTestSuite extends AnyFunSuite with SparkSessionFixture {
+
+  import spark.implicits._
+
+  spark.sparkContext.setLogLevel("ERROR")
+
+  test("A MinMaxRule should be instantiated correctly.") {
+
+    val minMaxRule = Rule("Temperature_MinMax_Rule", col("temperature"), Bounds(34.0, 85.0))
+
+    // Ensure that all attributes are set correctly
+    assert(minMaxRule.ruleName == "Temperature_MinMax_Rule", "Rule name is not set as expected.")
+    assert(minMaxRule.inputColumnName == "temperature", "Input column name is not set as expected.")
+    assert(minMaxRule.ruleType == RuleType.ValidateBounds, "The rule type is not set as expected.")
+    assert(!minMaxRule.isImplicitBool, "The rule should not be an implicit boolean expression.")
+    assert(!minMaxRule.isAgg, "The rule should not be an aggregation.")
+
+    // Ensure that the boundaries are set correctly
+    assert(minMaxRule.boundaries.lower == 34.0, "Lower boundary is not set as expected.")
+    assert(minMaxRule.boundaries.upper == 85.0, "Upper boundary is not set as expected.")
+
+  }
+
+  test("An implicit boolean expression should be instantiated correctly.") {
+
+    // Ensure a single column of type boolean can be instantiated correctly
+    val coolingBoolRule = Rule("Implicit_Cooling_Rule", col("cooling_bool"))
+
+    // Ensure that all attributes are set correctly
+    assert(coolingBoolRule.ruleName == "Implicit_Cooling_Rule", "Rule name is not set as expected.")
+    assert(coolingBoolRule.inputColumnName == "cooling_bool", "Input column name is not set as expected.")
+    assert(coolingBoolRule.ruleType == RuleType.ValidateExpr, "The rule type is not set as expected.")
+    assert(coolingBoolRule.isImplicitBool, "The rule should be an implicit boolean expression.")
+    assert(!coolingBoolRule.isAgg, "The rule should not be an aggregation.")
+
+    // Ensure that a boolean expression can be used to create an implicit boolean rule
+    val coolingExprRule = Rule("Implicit_Cooling_Expr", col("current_temp") > col("target_temp"))
+
+    // Ensure that all attributes are set correctly
+    assert(coolingExprRule.ruleName == "Implicit_Cooling_Expr", "Rule name is not set as expected.")
+    assert(coolingExprRule.inputColumnName == "(current_temp > target_temp)", "Input column name is not set as expected.")
+    assert(coolingExprRule.ruleType == RuleType.ValidateExpr, "The rule type is not set as expected.")
+    assert(coolingExprRule.isImplicitBool, "The rule should be an implicit boolean expression.")
+    assert(!coolingExprRule.isAgg, "The rule should not be an aggregation.")
+
+  }
+
+  test("A column can be ruled equivalent to an expression.") {
+
+    // Ensure that an equivalence comparison can be made between a column and an expression
+    val coolingBoolRule = Rule("Thermostat_Cooling_Rule", col("cooling_bool"), (col("current_temp") - col("target_temp")) >= 7.0)
+
+    // Ensure that all attributes are set correctly
assert(coolingBoolRule.ruleName == "Thermostat_Cooling_Rule", "Rule name is not set as expected.") + assert(coolingBoolRule.inputColumnName == "cooling_bool", "Input column name is not set as expected.") + assert(coolingBoolRule.ruleType == RuleType.ValidateExpr, "The rule type is not set as expected.") + assert(!coolingBoolRule.isImplicitBool, "The rule should not be an implicit boolean expression.") + assert(!coolingBoolRule.isAgg, "The rule should not be an aggregation.") + + } + + test("A list of numerical values rule can be instantiated correctly.") { + + // Ensure that a rule with a numerical LOV can be created + val heatingRateIntRule = Rule("Heating_Rate_Int_Rule", col("heating_rate"), Array(0, 1, 5, 10, 15)) + + // Ensure that all attributes are set correctly for Integers + assert(heatingRateIntRule.ruleName == "Heating_Rate_Int_Rule", "Rule name is not set as expected.") + assert(heatingRateIntRule.inputColumnName == "heating_rate", "Input column name is not set as expected.") + assert(heatingRateIntRule.ruleType == RuleType.ValidateNumerics, "The rule type is not set as expected.") + assert(!heatingRateIntRule.isImplicitBool, "The rule should not be an implicit boolean expression.") + assert(!heatingRateIntRule.isAgg, "The rule should not be an aggregation.") + + // Ensure that all attributes are set correctly for Doubles + val heatingRateDoubleRule = Rule("Heating_Rate_Double_Rule", col("heating_rate"), Array(0.0, 0.1, 0.5, 0.10, 0.15)) + assert(heatingRateDoubleRule.ruleName == "Heating_Rate_Double_Rule", "Rule name is not set as expected.") + assert(heatingRateDoubleRule.inputColumnName == "heating_rate", "Input column name is not set as expected.") + assert(heatingRateDoubleRule.ruleType == RuleType.ValidateNumerics, "The rule type is not set as expected.") + assert(!heatingRateDoubleRule.isImplicitBool, "The rule should not be an implicit boolean expression.") + assert(!heatingRateDoubleRule.isAgg, "The rule should not be an aggregation.") + + // Ensure that all attributes are set correctly for Longs + val heatingRateLongRule = Rule("Heating_Rate_Long_Rule", col("heating_rate"), Array(111111111111111L, 211111111111111L, 311111111111111L)) + assert(heatingRateLongRule.ruleName == "Heating_Rate_Long_Rule", "Rule name is not set as expected.") + assert(heatingRateLongRule.inputColumnName == "heating_rate", "Input column name is not set as expected.") + assert(heatingRateLongRule.ruleType == RuleType.ValidateNumerics, "The rule type is not set as expected.") + assert(!heatingRateLongRule.isImplicitBool, "The rule should not be an implicit boolean expression.") + assert(!heatingRateLongRule.isAgg, "The rule should not be an aggregation.") + + } + + test("A list of string values rule can be instantiated correctly.") { + + // Ensure that a rule with a numerical LOV can be created + val buildingNameRule = Rule("Building_LOV_Rule", col("site_name"), Array("SiteA", "SiteB", "SiteC")) + + // Ensure that all attributes are set correctly for Integers + assert(buildingNameRule.ruleName == "Building_LOV_Rule", "Rule name is not set as expected.") + assert(buildingNameRule.inputColumnName == "site_name", "Input column name is not set as expected.") + assert(buildingNameRule.ruleType == RuleType.ValidateStrings, "The rule type is not set as expected.") + assert(!buildingNameRule.isImplicitBool, "The rule should not be an implicit boolean expression.") + assert(!buildingNameRule.isAgg, "The rule should not be an aggregation.") + + } + +} From c92dba5fc83aa8849d121709e34fda690aa608c3 Mon Sep 
17 00:00:00 2001 From: Will Girten Date: Mon, 2 Aug 2021 13:11:46 -0400 Subject: [PATCH 05/10] Add tests for RuleSet class. --- .../labs/validation/RuleSetTestSuite.scala | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 src/test/scala/com/databricks/labs/validation/RuleSetTestSuite.scala diff --git a/src/test/scala/com/databricks/labs/validation/RuleSetTestSuite.scala b/src/test/scala/com/databricks/labs/validation/RuleSetTestSuite.scala new file mode 100644 index 0000000..0f46ff4 --- /dev/null +++ b/src/test/scala/com/databricks/labs/validation/RuleSetTestSuite.scala @@ -0,0 +1,161 @@ +package com.databricks.labs.validation + +import com.databricks.labs.validation.utils.Structures.Bounds +import org.apache.spark.sql.functions._ +import org.scalatest.funsuite.AnyFunSuite + + +class RuleSetTestSuite extends AnyFunSuite with SparkSessionFixture { + + import spark.implicits._ + + spark.sparkContext.setLogLevel("ERROR") + + test("A rule set should be created from a DataFrame.") { + val testDF = Seq( + (1, 2, 3), + (4, 5, 6), + (7, 8, 9) + ).toDF("retail_price", "scan_price", "cost") + val testRuleSet = RuleSet(testDF) + + // Ensure that the RuleSet DataFrame is set properly + assert(testRuleSet.getDf.exceptAll(testDF).count() == 0, "RuleSet DataFrame is not equal to the input DataFrame.") + + // Ensure that the RuleSet properties are set properly + assert(!testRuleSet.isGrouped) + assert(testRuleSet.getGroupBys.isEmpty) + assert(testRuleSet.getRules.isEmpty) + + } + + test("A rule set should be created from a DataFrame grouped by multiple columns.") { + val testDF = Seq( + ("food_a", 2.51, 3, 111111111111111L), + ("food_b", 5.11, 6, 211111111111111L), + ("food_b", 5.32, 7, 311111111111111L), + ("food_d", 8.22, 99, 411111111111111L) + ).toDF("product_name", "scan_price", "cost", "id") + val testRuleSet = RuleSet(testDF, Array("product_name", "id")) + + // Ensure that the RuleSet DataFrame is set properly + assert(testRuleSet.getDf.exceptAll(testDF).count() == 0, "RuleSet DataFrame is not equal to the input DataFrame.") + + // Ensure that the group-by columns are set properly + assert(testRuleSet.isGrouped) + assert(testRuleSet.getGroupBys.length == 2) + assert(testRuleSet.getGroupBys.contains("product_name")) + assert(testRuleSet.getGroupBys.contains("id")) + + // Ensure that the RuleSet properties are set properly + assert(testRuleSet.getRules.isEmpty) + + } + + test("A rule set should be created from a DataFrame grouped by a single column.") { + val testDF = Seq( + ("food_a", 2.51, 3, 111111111111111L), + ("food_b", 5.11, 6, 211111111111111L), + ("food_b", 5.32, 7, 311111111111111L), + ("food_d", 8.22, 99, 411111111111111L) + ).toDF("product_name", "scan_price", "cost", "id") + val testRuleSet = RuleSet(testDF, "product_name") + + // Ensure that the RuleSet DataFrame is set properly + assert(testRuleSet.getDf.exceptAll(testDF).count() == 0, "RuleSet DataFrame is not equal to the input DataFrame.") + + // Ensure that the group-by columns are set properly + assert(testRuleSet.isGrouped) + assert(testRuleSet.getGroupBys.length == 1) + assert(testRuleSet.getGroupBys.head == "product_name") + + // Ensure that the RuleSet properties are set properly + assert(testRuleSet.getRules.isEmpty) + + } + + test("A rule set should be created from a DataFrame and list of rules.") { + val testDF = Seq( + ("Toyota", "Camry", 30000.00, 111111111111111L), + ("Ford", "Escape", 18750.00, 211111111111111L), + ("Ford", "Mustang", 32000.00, 311111111111111L), + ("Nissan", "Maxima", 
25000.00, 411111111111111L) + ).toDF("make", "model", "msrp", "id") + val makeLovRule = Rule("Valid_Auto_Maker_Rule", col("make"), Array("Ford", "Toyota", "Nissan", "BMW", "Chevrolet")) + val modelLovRule = Rule("Valid_Auto_Models_Rule", col("model"), Array("Camry", "Mustang", "Maxima", "Escape", "330i")) + val groupedRuleSet = RuleSet(testDF, Array(makeLovRule, modelLovRule), Array("make")) + + // Ensure that the RuleSet DataFrame is set properly + assert(groupedRuleSet.getDf.exceptAll(testDF).count() == 0, "RuleSet DataFrame is not equal to the input DataFrame.") + + // Ensure that the RuleSet properties are set properly + assert(groupedRuleSet.isGrouped) + assert(groupedRuleSet.getGroupBys.length == 1) + assert(groupedRuleSet.getGroupBys.head == "make") + assert(groupedRuleSet.getRules.length == 2) + assert((groupedRuleSet.getRules.map(_.ruleName) diff Seq("Valid_Auto_Maker_Rule", "Valid_Auto_Models_Rule")).isEmpty) + + // Ensure a RuleSet can be created with a non-grouped DataFrame + val nonGroupedRuleSet = RuleSet(testDF, Array(makeLovRule, modelLovRule)) + + // Ensure that the RuleSet DataFrame is set properly + assert(nonGroupedRuleSet.getDf.exceptAll(testDF).count() == 0, "RuleSet DataFrame is not equal to the input DataFrame.") + + // Ensure that the RuleSet properties are set properly + assert(nonGroupedRuleSet.getGroupBys.isEmpty) + assert(nonGroupedRuleSet.getRules.length == 2) + assert((nonGroupedRuleSet.getRules.map(_.ruleName) diff Seq("Valid_Auto_Maker_Rule", "Valid_Auto_Models_Rule")).isEmpty) + } + + test("A rule set should be created from a DataFrame and list of MinMax rules.") { + val testDF = Seq( + ("Toyota", "Camry", 30000.00, 111111111111111L), + ("Ford", "Escape", 18750.00, 211111111111111L), + ("Ford", "Mustang", 32000.00, 311111111111111L), + ("Nissan", "Maxima", 25000.00, 411111111111111L) + ).toDF("make", "model", "msrp", "id") + val msrpBoundsRuleSet = RuleSet(testDF).addMinMaxRules("Valid_Auto_MSRP_Rule", col("msrp"), Bounds(1.0, 100000.0)) + + // Ensure that the RuleSet DataFrame is set properly + assert(msrpBoundsRuleSet.getDf.exceptAll(testDF).count() == 0, "RuleSet DataFrame is not equal to the input DataFrame.") + + // Ensure that the RuleSet properties are set properly + assert(msrpBoundsRuleSet.getGroupBys.isEmpty) + assert(msrpBoundsRuleSet.getRules.length == 2) + assert(Seq("Valid_Auto_MSRP_Rule_min", "Valid_Auto_MSRP_Rule_max").contains(msrpBoundsRuleSet.getRules(0).ruleName)) + assert(Seq("Valid_Auto_MSRP_Rule_min", "Valid_Auto_MSRP_Rule_max").contains(msrpBoundsRuleSet.getRules(1).ruleName)) + + } + + test("Two rule sets can be merged together.") { + + val testDF = Seq( + ("Toyota", "Camry", 30000.00, 111111111111111L), + ("Ford", "Escape", 18750.00, 211111111111111L), + ("Ford", "Mustang", 32000.00, 311111111111111L), + ("Nissan", "Maxima", 25000.00, 411111111111111L) + ).toDF("make", "model", "msrp", "id") + + // Create a bounds RuleSet + val msrpBoundsRuleSet = RuleSet(testDF).addMinMaxRules("Valid_Auto_MSRP_Rule", col("msrp"), Bounds(1.0, 100000.0)) + + // Create a LOV RuleSet + val makeLovRule = Rule("Valid_Auto_Maker_Rule", col("make"), Array("Ford", "Toyota", "Nissan", "BMW", "Chevrolet")) + val modelLovRule = Rule("Valid_Auto_Models_Rule", col("model"), Array("Camry", "Mustang", "Maxima", "Escape", "330i")) + val groupedRuleSet = RuleSet(testDF, Array(makeLovRule, modelLovRule), Array("make")) + + // Merge both RuleSets + val mergedRuleSet = groupedRuleSet.add(msrpBoundsRuleSet) + + // Ensure that the RuleSet DataFrame is set properly + 
assert(mergedRuleSet.getDf.exceptAll(testDF).count() == 0, "RuleSet DataFrame is not equal to the input DataFrame.") + + // Ensure that the RuleSet properties are set properly + assert(mergedRuleSet.getGroupBys.isEmpty) + assert(mergedRuleSet.getRules.length == 2) + assert(Seq("Valid_Auto_MSRP_Rule_min", "Valid_Auto_MSRP_Rule_max").contains(msrpBoundsRuleSet.getRules(0).ruleName)) + assert(Seq("Valid_Auto_MSRP_Rule_min", "Valid_Auto_MSRP_Rule_max").contains(msrpBoundsRuleSet.getRules(1).ruleName)) + + } + +} From 4de0f8ea3600b737589840118d9136d864a82857 Mon Sep 17 00:00:00 2001 From: Will Girten Date: Tue, 3 Aug 2021 09:44:48 -0400 Subject: [PATCH 06/10] Add test for complex expressions on aggregates. --- .../labs/validation/ValidatorTestSuite.scala | 63 ++++++++++++++++++- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala b/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala index 9553952..17e7bdc 100644 --- a/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala +++ b/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala @@ -2,6 +2,7 @@ package com.databricks.labs.validation import com.databricks.labs.validation.utils.Structures.{Bounds, MinMaxRuleDef} import org.apache.spark.sql.functions._ +import org.apache.spark.sql.expressions.Window import org.scalatest.funsuite.AnyFunSuite case class ValidationValue(ruleName: String, passed: Boolean, permitted: String, actual: String) @@ -443,7 +444,7 @@ class ValidatorTestSuite extends AnyFunSuite with SparkSessionFixture { (1, "iot_thermostat_1", 84.00, 74.00, ValidationValue("TemperatureDiffExpressionRule", passed = true, "(abs((current_temp - target_temp)) < 50.0)", "true")), (2, "iot_thermostat_2", 67.05, 72.00, ValidationValue("TemperatureDiffExpressionRule", passed = true, "(abs((current_temp - target_temp)) < 50.0)", "true")), (3, "iot_thermostat_3", 91.14, 76.00, ValidationValue("TemperatureDiffExpressionRule", passed = true, "(abs((current_temp - target_temp)) < 50.0)", "true")) - ).toDF(expectedColumns : _*) + ).toDF(expectedColumns: _*) val exprRuleSet = RuleSet(testDF) exprRuleSet.add(Rule("TemperatureDiffExpressionRule", abs(col("current_temp") - col("target_temp")) < 50.00)) @@ -481,7 +482,7 @@ class ValidatorTestSuite extends AnyFunSuite with SparkSessionFixture { (3, "iot_thermostat_3", 91, 69, -20, -10, ValidationValue("CoolingExpressionRule", passed = false, "abs(cooling_rate)", "10.0") ) - ).toDF(expectedColumns : _*) + ).toDF(expectedColumns: _*) val exprRuleSet = RuleSet(testDF) // Create a rule that ensure the cooling rate can accommodate the temp difference @@ -500,6 +501,62 @@ class ValidatorTestSuite extends AnyFunSuite with SparkSessionFixture { // Ensure that the complete report matches the expected output assert(validationResults.completeReport.exceptAll(expectedDF).count() == 0, "Expected explicit expression df is not equal to the returned rules report.") - } + } + + test("The input df should have 3 rule failures for complex expression rules.") { + + val testDF = Seq( + ("Northwest", 1001, 123256, 9.32, 8.99, 4.23, "2021-04-01", "2020-02-01 12:00:00.000"), // bad expiration date + ("Northwest", 1001, 123456, 19.99, 16.49, 12.99, "2021-07-26", "2020-02-02 12:08:00.000"), + ("Northwest", 1001, 123456, 0.99, 0.99, 0.10, "2021-07-26", "2020-02-02 12:10:00.000"), // price change too rapid -- same day + ("Northwest", 1001, 123456, 0.98, 0.90, 0.10, "2021-07-26", "2020-02-05 12:13:00.000"), 
+ ("Northwest", 1001, 123456, 0.99, 0.99, 0.10, "2021-07-26", "2020-02-07 00:00:00.000"), + ("Northwest", 1001, 122987, -9.99, -9.49, -6.49, "2021-07-26", "2021-02-01 00:00:00.000"), + ).toDF("region", "store_id", "sku", "retail_price", "scan_price", "cost", "expiration_date", "create_ts") + .withColumn("create_ts", 'create_ts.cast("timestamp")) + .withColumn("create_dt", 'create_ts.cast("date")) + + // Limit price updates to at most one per day + val window = Window.partitionBy("region", "store_id", "sku").orderBy("create_ts") + val skuUpdateRule = Rule("One_Update_Per_Day_Rule", unix_timestamp(col("create_ts")) - unix_timestamp(lag("create_ts", 1).over(window)) > 60 * 60 * 24) + + // Limit expiration date to be within a range + val expirationDateRule = Rule("Expiration_Date_Rule", col("expiration_date").cast("date").between("2021-05-01", "2021-12-31")) + + // Group by region, store_id, sku, expiration_date, create_ts + val validDatesRuleset = RuleSet(testDF, Array(skuUpdateRule, expirationDateRule), Seq("region", "store_id", "sku", "expiration_date", "create_ts")) + val validDatesResults = validDatesRuleset.validate() + + // Ensure that there are 2 rule failures + assert(validDatesResults.summaryReport.count() == 2) + assert(validDatesResults.completeReport.filter(not(col("One_Update_Per_Day_Rule.passed"))).count() == 1) + assert(validDatesResults.completeReport.filter(not(col("Expiration_Date_Rule.passed"))).count() == 1) + assert(validDatesResults.completeReport.filter(not(col("One_Update_Per_Day_Rule.passed"))).select("sku").as[Int].collect.head == 123456) + assert(validDatesResults.completeReport.filter(not(col("Expiration_Date_Rule.passed"))).select("sku").as[Int].collect.head == 123256) + + // Ensure that the ruleTypes are set correctly + assert(validDatesRuleset.getRules.count(_.ruleType == RuleType.ValidateExpr) == 2) + assert(validDatesRuleset.getRules.count(_.isImplicitBool) == 2) + assert(validDatesRuleset.getGroupBys.length == 5) + + // Limit price columns to be non-negative amounts + val nonNegativeColumns = array(col("retail_price"), col("scan_price"), col("cost")) + val nonNegativeValueRule = Rule("Non_Negative_Values_Rule", size(filter(nonNegativeColumns, c => c <= 0.0)) === 0) + + // Group by region, store_id, sku, retail_price, scan_price, cost + val nonNegativeValuesRuleset = RuleSet(testDF, Array(nonNegativeValueRule), Seq("region", "store_id", "sku", "retail_price", "scan_price", "cost")) + val nonNegativeValuesResults = nonNegativeValuesRuleset.validate() + + // Ensure that there is 1 rule failure + assert(nonNegativeValuesResults.summaryReport.count() == 1) + assert(nonNegativeValuesResults.completeReport.filter(not(col("Non_Negative_Values_Rule.passed"))).count() == 1) + assert(nonNegativeValuesResults.completeReport.filter(not(col("Non_Negative_Values_Rule.passed"))).select("sku").as[Int].collect.head == 122987) + + // Ensure that the ruleType is set correctly + assert(nonNegativeValuesRuleset.getRules.head.ruleType == RuleType.ValidateExpr) + assert(nonNegativeValuesRuleset.getRules.head.isImplicitBool) + assert(nonNegativeValuesRuleset.getGroupBys.length == 6) + + } } From dd0ee61cf5fac1a8505893fdf1f1c30d9fdcdcd2 Mon Sep 17 00:00:00 2001 From: Will Girten Date: Tue, 3 Aug 2021 11:54:20 -0400 Subject: [PATCH 07/10] Fix isGrouped bug when groupBys array is empty by default or explicitly set. 
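A short sketch of the behavior being fixed (illustration only, not part of the diff below;
it assumes a SparkSession `spark` and reuses the public RuleSet API exercised in the test suites):

```scala
import com.databricks.labs.validation.{Rule, RuleSet}
import org.apache.spark.sql.functions.col
import spark.implicits._

val df = Seq((1, 74.0), (2, 91.0)).toDF("id", "temperature")
val positiveTempRule = Rule("Positive_Temperature_Rule", col("temperature") > 0.0)

// An empty groupBy sequence should leave the RuleSet ungrouped.
val ungrouped = RuleSet(df, Array(positiveTempRule), Seq.empty[String])

assert(ungrouped.getGroupBys.isEmpty)
assert(!ungrouped.isGrouped) // was true before this change, since setGroupByCols always set the flag
```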
--- src/main/scala/com/databricks/labs/validation/RuleSet.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/com/databricks/labs/validation/RuleSet.scala b/src/main/scala/com/databricks/labs/validation/RuleSet.scala index 54f092a..8361168 100644 --- a/src/main/scala/com/databricks/labs/validation/RuleSet.scala +++ b/src/main/scala/com/databricks/labs/validation/RuleSet.scala @@ -35,7 +35,7 @@ class RuleSet extends SparkSessionWrapper { private def setGroupByCols(value: Seq[String]): this.type = { _groupBys = value - _isGrouped = true + _isGrouped = !value.isEmpty this } From d37730d3e3619459380f651897eb0ec96c1d1c7b Mon Sep 17 00:00:00 2001 From: Will Girten Date: Tue, 3 Aug 2021 14:52:17 -0400 Subject: [PATCH 08/10] Fix overloaded add function that merges 2 RuleSets. --- .../databricks/labs/validation/RuleSet.scala | 19 ++++++++++--------- .../labs/validation/RuleSetTestSuite.scala | 17 +++++++++++++---- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/main/scala/com/databricks/labs/validation/RuleSet.scala b/src/main/scala/com/databricks/labs/validation/RuleSet.scala index 8361168..9b07ba7 100644 --- a/src/main/scala/com/databricks/labs/validation/RuleSet.scala +++ b/src/main/scala/com/databricks/labs/validation/RuleSet.scala @@ -35,7 +35,7 @@ class RuleSet extends SparkSessionWrapper { private def setGroupByCols(value: Seq[String]): this.type = { _groupBys = value - _isGrouped = !value.isEmpty + _isGrouped = value.nonEmpty this } @@ -110,15 +110,16 @@ class RuleSet extends SparkSessionWrapper { } /** - * Merge two rule sets by adding one rule set to another - * - * @param ruleSet RuleSet to be added - * @return RuleSet - */ + * Merge two rule sets by adding one rule set to another + * + * @param ruleSet RuleSet to be added + * @return RuleSet + */ def add(ruleSet: RuleSet): RuleSet = { - new RuleSet().setDF(ruleSet.getDf) - .setIsGrouped(ruleSet.isGrouped) - .add(ruleSet.getRules) + val addtnlGroupBys = ruleSet.getGroupBys diff this.getGroupBys + val mergedGroupBys = this.getGroupBys ++ addtnlGroupBys + this.add(ruleSet.getRules) + .setGroupByCols(mergedGroupBys) } /** diff --git a/src/test/scala/com/databricks/labs/validation/RuleSetTestSuite.scala b/src/test/scala/com/databricks/labs/validation/RuleSetTestSuite.scala index 0f46ff4..a85dc04 100644 --- a/src/test/scala/com/databricks/labs/validation/RuleSetTestSuite.scala +++ b/src/test/scala/com/databricks/labs/validation/RuleSetTestSuite.scala @@ -148,13 +148,22 @@ class RuleSetTestSuite extends AnyFunSuite with SparkSessionFixture { val mergedRuleSet = groupedRuleSet.add(msrpBoundsRuleSet) // Ensure that the RuleSet DataFrame is set properly + assert(mergedRuleSet.getGroupBys.length == 1) assert(mergedRuleSet.getDf.exceptAll(testDF).count() == 0, "RuleSet DataFrame is not equal to the input DataFrame.") // Ensure that the RuleSet properties are set properly - assert(mergedRuleSet.getGroupBys.isEmpty) - assert(mergedRuleSet.getRules.length == 2) - assert(Seq("Valid_Auto_MSRP_Rule_min", "Valid_Auto_MSRP_Rule_max").contains(msrpBoundsRuleSet.getRules(0).ruleName)) - assert(Seq("Valid_Auto_MSRP_Rule_min", "Valid_Auto_MSRP_Rule_max").contains(msrpBoundsRuleSet.getRules(1).ruleName)) + assert(mergedRuleSet.getRules.length == 4) + val mergedRuleNames = Seq("Valid_Auto_MSRP_Rule_min", "Valid_Auto_MSRP_Rule_max", "Valid_Auto_Maker_Rule", "Valid_Auto_Models_Rule") + assert(mergedRuleSet.getRules.count(r => mergedRuleNames.contains(r.ruleName)) == 4) + + // Ensure groupBy columns are merged 
properly + val groupedLovRuleSet = RuleSet(testDF, Array(makeLovRule, modelLovRule), Array("make")) + val mergedTheOtherWay = msrpBoundsRuleSet.add(groupedLovRuleSet) + assert(mergedTheOtherWay.getGroupBys.length == 1) + assert(mergedTheOtherWay.getGroupBys.head == "make") + assert(mergedTheOtherWay.getDf.exceptAll(testDF).count() == 0) + mergedTheOtherWay.getRules.map(_.ruleName).foreach(println) + assert(mergedTheOtherWay.getRules.count(r => mergedRuleNames.contains(r.ruleName)) == 4) } From 698a7c7e8d892d34f855180a48011abfdd3014ee Mon Sep 17 00:00:00 2001 From: Will Girten Date: Wed, 4 Aug 2021 17:56:10 -0400 Subject: [PATCH 09/10] Add ignoreCase and invertMatch to ValidateStrings and ValidateNumerics rule types. --- .../com/databricks/labs/validation/Rule.scala | 72 +++++++++++++++-- .../labs/validation/Validator.scala | 7 +- .../labs/validation/RuleSetTestSuite.scala | 1 - .../labs/validation/ValidatorTestSuite.scala | 79 ++++++++++++++++--- 4 files changed, 137 insertions(+), 22 deletions(-) diff --git a/src/main/scala/com/databricks/labs/validation/Rule.scala b/src/main/scala/com/databricks/labs/validation/Rule.scala index 6e4b869..3f4006e 100644 --- a/src/main/scala/com/databricks/labs/validation/Rule.scala +++ b/src/main/scala/com/databricks/labs/validation/Rule.scala @@ -1,11 +1,8 @@ package com.databricks.labs.validation -import com.databricks.labs.validation.utils.Structures.{Bounds, ValidationException} +import com.databricks.labs.validation.utils.Structures.Bounds import org.apache.spark.sql.Column import org.apache.spark.sql.functions.{array, lit} -import org.apache.spark.sql.types.BooleanType - -import java.util.UUID /** * Definition of a rule @@ -21,6 +18,8 @@ class Rule( private var _validNumerics: Column = array(lit(null).cast("double")) private var _validStrings: Column = array(lit(null).cast("string")) private var _implicitBoolean: Boolean = false + private var _ignoreCase: Boolean = false + private var _invertMatch: Boolean = false val inputColumnName: String = inputColumn.expr.toString().replace("'", "") override def toString: String = { @@ -47,8 +46,8 @@ class Rule( this } - private def setValidStrings(value: Array[String]): this.type = { - _validStrings = lit(value) + private def setValidStrings(value: Array[String], ignoreCase: Boolean): this.type = { + _validStrings = if(ignoreCase) lit(value.map(_.toLowerCase)) else lit(value) inputColumn.expr.children.map(_.prettyName) this } @@ -63,6 +62,16 @@ class Rule( this } + private def setIgnoreCase(value: Boolean): this.type = { + _ignoreCase = value + this + } + + private def setInvertMatch(value: Boolean): this.type = { + _invertMatch = value + this + } + def boundaries: Bounds = _boundaries def validNumerics: Column = _validNumerics @@ -73,6 +82,10 @@ class Rule( def isImplicitBool: Boolean = _implicitBoolean + def ignoreCase: Boolean = _ignoreCase + + def invertMatch: Boolean = _invertMatch + def isAgg: Boolean = { inputColumn.expr.prettyName == "aggregateexpression" || inputColumn.expr.children.map(_.prettyName).contains("aggregateexpression") @@ -114,6 +127,18 @@ object Rule { .setValidExpr(validExpr) } + def apply( + ruleName: String, + column: Column, + validNumerics: Array[Double], + invertMatch: Boolean + ): Rule = { + + new Rule(ruleName, column, RuleType.ValidateNumerics) + .setValidNumerics(validNumerics) + .setInvertMatch(invertMatch) + } + def apply( ruleName: String, column: Column, @@ -122,6 +147,19 @@ object Rule { new Rule(ruleName, column, RuleType.ValidateNumerics) .setValidNumerics(validNumerics) 
+ .setInvertMatch(false) + } + + def apply( + ruleName: String, + column: Column, + validNumerics: Array[Long], + invertMatch: Boolean + ): Rule = { + + new Rule(ruleName, column, RuleType.ValidateNumerics) + .setValidNumerics(validNumerics.map(_.toString.toDouble)) + .setInvertMatch(invertMatch) } def apply( @@ -132,6 +170,19 @@ object Rule { new Rule(ruleName, column, RuleType.ValidateNumerics) .setValidNumerics(validNumerics.map(_.toString.toDouble)) + .setInvertMatch(false) + } + + def apply( + ruleName: String, + column: Column, + validNumerics: Array[Int], + invertMatch: Boolean + ): Rule = { + + new Rule(ruleName, column, RuleType.ValidateNumerics) + .setValidNumerics(validNumerics.map(_.toString.toDouble)) + .setInvertMatch(invertMatch) } def apply( @@ -142,16 +193,21 @@ object Rule { new Rule(ruleName, column, RuleType.ValidateNumerics) .setValidNumerics(validNumerics.map(_.toString.toDouble)) + .setInvertMatch(false) } def apply( ruleName: String, column: Column, - validStrings: Array[String] + validStrings: Array[String], + ignoreCase: Boolean = false, + invertMatch: Boolean = false ): Rule = { new Rule(ruleName, column, RuleType.ValidateStrings) - .setValidStrings(validStrings) + .setValidStrings(validStrings, ignoreCase) + .setIgnoreCase(ignoreCase) + .setInvertMatch(invertMatch) } } diff --git a/src/main/scala/com/databricks/labs/validation/Validator.scala b/src/main/scala/com/databricks/labs/validation/Validator.scala index afd782b..d005311 100644 --- a/src/main/scala/com/databricks/labs/validation/Validator.scala +++ b/src/main/scala/com/databricks/labs/validation/Validator.scala @@ -29,16 +29,19 @@ class Validator(ruleSet: RuleSet, detailLvl: Int) extends SparkSessionWrapper { rule.inputColumn.cast("string").alias("actual") ).alias(rule.ruleName) case RuleType.ValidateNumerics => + val ruleExpr = if(rule.invertMatch) not(array_contains(rule.validNumerics, rule.inputColumn)) else array_contains(rule.validNumerics, rule.inputColumn) struct( lit(rule.ruleName).alias("ruleName"), - array_contains(rule.validNumerics, rule.inputColumn).alias("passed"), + ruleExpr.alias("passed"), rule.validNumerics.cast("string").alias("permitted"), rule.inputColumn.cast("string").alias("actual") ).alias(rule.ruleName) case RuleType.ValidateStrings => + val ruleValue = if(rule.ignoreCase) lower(rule.inputColumn) else rule.inputColumn + val ruleExpr = if(rule.invertMatch) not(array_contains(rule.validStrings, ruleValue)) else array_contains(rule.validStrings, ruleValue) struct( lit(rule.ruleName).alias("ruleName"), - array_contains(rule.validStrings, rule.inputColumn).alias("passed"), + ruleExpr.alias("passed"), rule.validStrings.cast("string").alias("permitted"), rule.inputColumn.cast("string").alias("actual") ).alias(rule.ruleName) diff --git a/src/test/scala/com/databricks/labs/validation/RuleSetTestSuite.scala b/src/test/scala/com/databricks/labs/validation/RuleSetTestSuite.scala index a85dc04..2cdad2e 100644 --- a/src/test/scala/com/databricks/labs/validation/RuleSetTestSuite.scala +++ b/src/test/scala/com/databricks/labs/validation/RuleSetTestSuite.scala @@ -162,7 +162,6 @@ class RuleSetTestSuite extends AnyFunSuite with SparkSessionFixture { assert(mergedTheOtherWay.getGroupBys.length == 1) assert(mergedTheOtherWay.getGroupBys.head == "make") assert(mergedTheOtherWay.getDf.exceptAll(testDF).count() == 0) - mergedTheOtherWay.getRules.map(_.ruleName).foreach(println) assert(mergedTheOtherWay.getRules.count(r => mergedRuleNames.contains(r.ruleName)) == 4) } diff --git 
a/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala b/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala index 17e7bdc..5bd6c50 100644 --- a/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala +++ b/src/test/scala/com/databricks/labs/validation/ValidatorTestSuite.scala @@ -346,7 +346,7 @@ class ValidatorTestSuite extends AnyFunSuite with SparkSessionFixture { assert(validationResults.completeReport.exceptAll(expectedDF).count() == 0, "Expected df is not equal to the returned rules report.") } - test("Validate list of values with numeric types, string types and long types.") { + test("Validate list of values with integer, double, and long types.") { val testDF = Seq( ("food_a", 2.51, 3, 111111111111111L), ("food_b", 5.11, 6, 211111111111111L), @@ -396,36 +396,93 @@ class ValidatorTestSuite extends AnyFunSuite with SparkSessionFixture { // Ensure that all rows passed the Rules assert(numericValidationResults.summaryReport.isEmpty) + // Ensure rows can be validated against a list of invalid numerics + val invalidNumColumns = testDF.columns ++ Seq("CheckIfCostIsInLOV", "CheckIfScanPriceIsInLOV", "CheckIfIdIsInLOV") + val invalidNumsExpectedDF = Seq( + ("food_a", 2.51, 3, 111111111111111L, + ValidationValue("Invalid_Price_Rule", passed = true, "[-1.0, -5.0, 0.0, 1000.0]", "2.51"), + ValidationValue("Invalid_Id_Rule", passed = true, "[7.11111111111111E14, 8.11111111111111E14, 9.11111111111111E14]", "111111111111111"), + ValidationValue("Invalid_Cost_Rule", passed = true, "[99.0, 10000.0, 100000.0, 1000000.0]", "3") + ), + ("food_b", 5.11, 6, 211111111111111L, + ValidationValue("Invalid_Price_Rule", passed = true, "[-1.0, -5.0, 0.0, 1000.0]", "5.11"), + ValidationValue("Invalid_Id_Rule", passed = true, "[7.11111111111111E14, 8.11111111111111E14, 9.11111111111111E14]", "211111111111111"), + ValidationValue("Invalid_Cost_Rule", passed = true, "[99.0, 10000.0, 100000.0, 1000000.0]", "6") + ), + ("food_c", 8.22, 99, 311111111111111L, + ValidationValue("Invalid_Price_Rule", passed = true, "[-1.0, -5.0, 0.0, 1000.0]", "8.22"), + ValidationValue("Invalid_Id_Rule", passed = true, "[7.11111111111111E14, 8.11111111111111E14, 9.11111111111111E14]", "311111111111111"), + ValidationValue("Invalid_Cost_Rule", passed = false, "[99.0, 10000.0, 100000.0, 1000000.0]", "99") + ) + ).toDF(expectedColumns: _*) + + val invalidPrices = Array(-1.00, -5.00, 0.00, 1000.0) + val invalidIds = Array(711111111111111L, 811111111111111L, 911111111111111L) + val invalidCosts = Array(99, 10000, 100000, 1000000) + val invalidNumericalRules = Array( + Rule("Invalid_Price_Rule", col("scan_price"), invalidPrices, invertMatch = true), + Rule("Invalid_Id_Rule", col("id"), invalidIds, invertMatch = true), + Rule("Invalid_Cost_Rule", col("cost"), invalidCosts, invertMatch = true), + ) + val invalidNumericalResults = RuleSet(testDF).add(invalidNumericalRules).validate() + + // Ensure that there is 1 failed row + assert(invalidNumericalResults.summaryReport.count() == 1) + + // Ensure that the invertMatch attribute is set properly + assert(invalidNumericalRules.count(_.invertMatch) == 3) + + // Ensure that the validation report matches expected output + assert(invalidNumericalResults.completeReport.exceptAll(invalidNumsExpectedDF).count() == 0, "Expected invalid numerics df is not equal to the returned rules report.") + + } + + test("The input df should have no rule failures for valid string LOVs.") { + val testDF = Seq( + ("food_a", 2.51, 3, 111111111111111L), + ("food_b", 5.11, 
6, 211111111111111L), + ("food_c", 8.22, 99, 311111111111111L) + ).toDF("product_name", "scan_price", "cost", "id") + // Create a String List of Values Rule - val stringRule = Rule("CheckIfProductNameInLOV", col("product_name"), Array("food_a", "food_b", "food_c")) + val validProductNamesRule = Rule("CheckIfProductNameInLOV", col("product_name"), Array("food_a", "food_b", "food_c")) + val stringIgnoreCaseRule = Rule("IgnoreCaseProductNameLOV", col("product_name"), Array("Food_B", "food_A", "FOOD_C"), ignoreCase = true) + val invalidFoodsRule = Rule("InvalidProductNameLOV", col("product_name"), Array("food_x", "food_y", "food_z"), invertMatch = true) - val expectedStringLovColumns = testDF.columns ++ Seq("CheckIfProductNameInLOV") + val expectedStringLovColumns = testDF.columns ++ Seq("CheckIfProductNameInLOV", "IgnoreCaseProductNameLOV", "InvalidProductNameLOV") val stringLovExpectedDF = Seq( ("food_a", 2.51, 3, 111111111111111L, - ValidationValue("CheckIfProductNameInLOV", passed = true, "[food_a, food_b, food_c]", "food_a") + ValidationValue("CheckIfProductNameInLOV", passed = true, "[food_a, food_b, food_c]", "food_a"), + ValidationValue("IgnoreCaseProductNameLOV", passed = true, "[food_b, food_a, food_c]", "food_a"), + ValidationValue("InvalidProductNameLOV", passed = true, "[food_x, food_y, food_z]", "food_a") ), ("food_b", 5.11, 6, 211111111111111L, - ValidationValue("CheckIfProductNameInLOV", passed = true, "[food_a, food_b, food_c]", "food_b") + ValidationValue("CheckIfProductNameInLOV", passed = true, "[food_a, food_b, food_c]", "food_b"), + ValidationValue("IgnoreCaseProductNameLOV", passed = true, "[food_b, food_a, food_c]", "food_b"), + ValidationValue("InvalidProductNameLOV", passed = true, "[food_x, food_y, food_z]", "food_b") ), ("food_c", 8.22, 99, 311111111111111L, - ValidationValue("CheckIfProductNameInLOV", passed = true, "[food_a, food_b, food_c]", "food_c") + ValidationValue("CheckIfProductNameInLOV", passed = true, "[food_a, food_b, food_c]", "food_c"), + ValidationValue("IgnoreCaseProductNameLOV", passed = true, "[food_b, food_a, food_c]", "food_c"), + ValidationValue("InvalidProductNameLOV", passed = true, "[food_x, food_y, food_z]", "food_c") ) ).toDF(expectedStringLovColumns: _*) // Validate testDF against String LOV Rule - val stringRuleSet = RuleSet(testDF) - stringRuleSet.add(stringRule) + val productNameRules = Array(validProductNamesRule, stringIgnoreCaseRule, invalidFoodsRule) + val stringRuleSet = RuleSet(testDF).add(productNameRules) + val stringValidationResults = stringRuleSet.validate() // Ensure that the ruleType is set properly - assert(stringRule.ruleType == RuleType.ValidateStrings) + assert(validProductNamesRule.ruleType == RuleType.ValidateStrings) // Ensure that the complete report matches expected output assert(stringValidationResults.completeReport.exceptAll(stringLovExpectedDF).count() == 0, "Expected String LOV df is not equal to the returned rules report.") // Ensure that there are infinite boundaries, by default - assert(stringRule.boundaries.lower == Double.NegativeInfinity, "Lower boundaries are not negatively infinite.") - assert(stringRule.boundaries.upper == Double.PositiveInfinity, "Upper boundaries are not positively infinite.") + assert(validProductNamesRule.boundaries.lower == Double.NegativeInfinity, "Lower boundaries are not negatively infinite.") + assert(validProductNamesRule.boundaries.upper == Double.PositiveInfinity, "Upper boundaries are not positively infinite.") // Ensure that all rows passed; there are no failed rows 
assert(stringValidationResults.summaryReport.isEmpty) From eda625e9764c3a1e6c9a620dcd56541de3c7fdd0 Mon Sep 17 00:00:00 2001 From: Will Girten Date: Fri, 6 Aug 2021 10:51:28 -0400 Subject: [PATCH 10/10] Update documentation with latest features in categorical Rules. --- README.md | 15 +++++- demo/Example.scala | 19 +++---- demo/Rules_Engine_Examples.dbc | Bin 5198 -> 5390 bytes demo/Rules_Engine_Examples.html | 49 +++++++++--------- .../labs/validation/utils/Structures.scala | 4 +- 5 files changed, 52 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 3cfaae7..7939491 100644 --- a/README.md +++ b/README.md @@ -174,7 +174,8 @@ someRuleSet.addMinMaxRules("Retail_Price_Validation", col("retail_price"), Bound ### Categorical Rules There are two types of categorical rules which are used to validate against a pre-defined list of valid values. As of 0.2 accepted categorical types are String, Double, Int, Long but any types outside of this can -be input as an array() column of any type so long as it can be evaulated against the intput column +be input as an array() column of any type so long as it can be evaluated against the input column. + ```scala val catNumerics = Array( Rule("Valid_Stores", col("store_id"), Lookups.validStoreIDs), @@ -187,6 +188,18 @@ Rule("Valid_Regions", col("region"), Lookups.validRegions) ) ``` +An optional `ignoreCase` parameter can be specified when evaluating against a list of String values to ignore or apply +case-sensitivity. By default, input columns will be evaluated against a list of Strings with case-sensitivity applied. +```scala +Rule("Valid_Regions", col("region"), Lookups.validRegions, ignoreCase=true) +``` + +Furthermore, the evaluation of categorical rules can be inverted by specifying `invertMatch=true` as a parameter. +This can be handy when defining a Rule that an input column cannot match list of invalid values. For example: +```scala +Rule("Invalid_Skus", col("sku"), Lookups.invalidSkus, invertMatch=true) +``` + ### Validation Now that you have some rules built up... it's time to build the ruleset and validate it. As mentioned above, the dataframe can be a simple df or a grouped df by passing column[s] to perform validation at the diff --git a/demo/Example.scala b/demo/Example.scala index ec278ed..1f52a12 100644 --- a/demo/Example.scala +++ b/demo/Example.scala @@ -50,11 +50,12 @@ object Example extends App with SparkSessionWrapper { val catNumerics = Array( Rule("Valid_Stores", col("store_id"), Lookups.validStoreIDs), - Rule("Valid_Skus", col("sku"), Lookups.validSkus) + Rule("Valid_Skus", col("sku"), Lookups.validSkus), + Rule("Invalid_Skus", col("sku"), Lookups.invalidSkus, invertMatch=true) ) val catStrings = Array( - Rule("Valid_Regions", col("region"), Lookups.validRegions) + Rule("Valid_Regions", col("region"), Lookups.validRegions, ignoreCase=true) ) //TODO - validate datetime @@ -76,18 +77,18 @@ object Example extends App with SparkSessionWrapper { .withColumn("create_dt", 'create_ts.cast("date")) // Doing the validation - // The validate method will return the rules report dataframe which breaks down which rules passed and which - // rules failed and how/why. The second return value returns a boolean to determine whether or not all tests passed -// val (rulesReport, passed) = RuleSet(df, Array("store_id")) - val (rulesReport, passed) = RuleSet(df) + // The validate method will return two reports - a complete report and a summary report. 
+  // The complete report is verbose and will add all rule validations to the right side of the original
+  // df passed into RuleSet, while the summary report will contain all of the rows that failed one or more
+  // Rule evaluations.
+  val validationResults = RuleSet(df)
     .add(specializedRules)
     .add(minMaxPriceRules)
     .add(catNumerics)
     .add(catStrings)
-    .validate(2)
+    .validate()
 
-  rulesReport.show(200, false)
-//  rulesReport.printSchema()
+  validationResults.completeReport.show(200, false)
 
 }

diff --git a/demo/Rules_Engine_Examples.dbc b/demo/Rules_Engine_Examples.dbc
index f4770a77b4ac2700be2b59dca5406415b3c175e9..3d21bf789730abd89ccf9f866cf841be36c8c0a1 100644
GIT binary patch
[base85-encoded binary deltas (delta 5327, delta 5153) omitted: compiled notebook archive, not human-readable]
diff --git a/demo/Rules_Engine_Examples.html b/demo/Rules_Engine_Examples.html
index 15b5d5a..b85b426 100644
--- a/demo/Rules_Engine_Examples.html
+++ b/demo/Rules_Engine_Examples.html
@@ -10,33 +10,34 @@
[HTML hunk body not preserved in this extract; the exported notebook markup was stripped]
diff --git a/src/main/scala/com/databricks/labs/validation/utils/Structures.scala b/src/main/scala/com/databricks/labs/validation/utils/Structures.scala
index eb7bda0..b5aec4a 100644
--- a/src/main/scala/com/databricks/labs/validation/utils/Structures.scala
+++ b/src/main/scala/com/databricks/labs/validation/utils/Structures.scala
@@ -13,7 +13,9 @@ object Lookups {
   final val validRegions = Array("Northeast", "Southeast", "Midwest", "Northwest", "Southcentral", "Southwest")
 
-  final val validSkus = Array(123456, 122987,123256, 173544, 163212, 365423, 168212)
+  final val validSkus = Array(123456, 122987, 123256, 173544, 163212, 365423, 168212)
+
+  final val invalidSkus = Array(9123456, 9122987, 9123256, 9173544, 9163212, 9365423, 9168212)
 }
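Taken together, patches 09 and 10 add `ignoreCase` and `invertMatch` options to the categorical
(list-of-values) rules. A minimal usage sketch, assuming a SparkSession `spark`; the column names
and lookup values are reused from the demo data, and this is an illustration rather than part of
the patch series:

```scala
import com.databricks.labs.validation.{Rule, RuleSet}
import org.apache.spark.sql.functions.col
import spark.implicits._

val df = Seq(("northeast", 123456), ("Southwest", 9123456)).toDF("region", "sku")

// Case-insensitive membership check against the list of valid regions
val validRegionRule = Rule("Valid_Regions", col("region"),
  Array("Northeast", "Southeast", "Midwest", "Northwest", "Southcentral", "Southwest"),
  ignoreCase = true)

// Inverted match: the sku must NOT appear in the list of known-bad SKUs
val invalidSkuRule = Rule("Invalid_Skus", col("sku"),
  Array(9123456, 9122987, 9123256), invertMatch = true)

val results = RuleSet(df)
  .add(Array(validRegionRule, invalidSkuRule))
  .validate()

results.completeReport.show(false) // original columns plus one result struct per rule
results.summaryReport.show(false)  // only the rows that failed at least one rule
```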