From ea7b7fecabed86407e0dda9e9da4e579059239c4 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 1 Dec 2023 18:42:22 -0800 Subject: [PATCH] Test `inset` with `NaN` only for Spark from 3.1.3 (#9928) * Fix test and update docs Signed-off-by: Nghia Truong * Update docs/compatibility.md Co-authored-by: Jason Lowe * Update docs/compatibility.md Co-authored-by: Jason Lowe * Fix docs Signed-off-by: Nghia Truong * Update integration_tests/src/main/python/cmp_test.py --------- Signed-off-by: Nghia Truong Co-authored-by: Jason Lowe --- docs/compatibility.md | 12 ++++++++++++ integration_tests/src/main/python/cmp_test.py | 11 ++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 9d411f56d50..4e1d604b1ea 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -83,6 +83,18 @@ after Spark 3.1.0. We do not disable operations that produce different results due to `-0.0` in the data because it is considered to be a rare occurrence. +### `NaN` vs `NaN` + +Apache Spark does not have a consistent way to handle `NaN` comparison. Sometimes, all `NaN` are +considered as one unique value while other times they can be treated as different. The outcome of +`NaN` comparison can differ in various operations and has also changed between Spark versions. +The RAPIDS Accelerator tries to match its output with Apache Spark except for the operation(s) listed below: + - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 and + prior versions, see [SPARK-36792](https://issues.apache.org/jira/browse/SPARK-36792) for more details. +The RAPIDS Accelerator compares `NaN` values as equal for this operation, which matches +the behavior of Apache Spark 3.1.3 and later versions. + + ## Decimal Support Apache Spark supports decimal values with a precision up to 38. This equates to 128-bits. 
diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py index a891b667016..f2e08339363 100644 --- a/integration_tests/src/main/python/cmp_test.py +++ b/integration_tests/src/main/python/cmp_test.py @@ -17,7 +17,7 @@ from asserts import assert_gpu_and_cpu_are_equal_collect from conftest import is_not_utc from data_gen import * -from spark_session import with_cpu_session, is_before_spark_330 +from spark_session import with_cpu_session, is_before_spark_313, is_before_spark_330 from pyspark.sql.types import * from marks import datagen_overrides import pyspark.sql.functions as f @@ -346,11 +346,16 @@ def test_in(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').isin(scalars))) +# We avoid testing inset with NaN in Spark < 3.1.3 since it has an issue with NaN comparisons. +# See https://github.com/NVIDIA/spark-rapids/issues/9687. +test_inset_data_gen = [gen for gen in eq_gens_with_decimal_gen if gen != float_gen if gen != double_gen] + \ + [FloatGen(no_nans=True), DoubleGen(no_nans=True)] \ + if is_before_spark_313() else eq_gens_with_decimal_gen + # Spark supports two different versions of 'IN', and it depends on the spark.sql.optimizer.inSetConversionThreshold conf # This is to test entries over that value. -@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9687') -@pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) @pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.parametrize('data_gen', test_inset_data_gen, ids=idfn) def test_in_set(data_gen): # nulls are not supported for in on the GPU yet num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) + 1