diff --git a/docs/compatibility.md b/docs/compatibility.md
index 9d411f56d50..4e1d604b1ea 100644
--- a/docs/compatibility.md
+++ b/docs/compatibility.md
@@ -83,6 +83,17 @@ after Spark 3.1.0.
 We do not disable operations that produce different results due to `-0.0` in the data because it is
 considered to be a rare occurrence.
 
+### `NaN` vs `NaN`
+
+Apache Spark does not handle `NaN` comparisons consistently. Sometimes all `NaN` values are
+considered equal to one another, while other times each `NaN` is treated as distinct. The outcome
+of a `NaN` comparison varies between operations and has also changed between Spark versions. The
+RAPIDS Accelerator matches the output of Apache Spark except for the operation listed below:
+ - `IN` SQL expression: `NaN` values can be treated as distinct in Spark 3.1.2 and prior
+   versions; see [SPARK-36792](https://issues.apache.org/jira/browse/SPARK-36792) for more
+   details. The RAPIDS Accelerator compares `NaN` values as equal for this operation, which
+   matches the behavior of Apache Spark 3.1.3 and later versions.
+
 ## Decimal Support
 
 Apache Spark supports decimal values with a precision up to 38. This equates to 128-bits.
diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py
index 59759098f28..1e1549f28be 100644
--- a/integration_tests/src/main/python/cmp_test.py
+++ b/integration_tests/src/main/python/cmp_test.py
@@ -17,7 +17,7 @@
 from asserts import assert_gpu_and_cpu_are_equal_collect
 from conftest import is_not_utc
 from data_gen import *
-from spark_session import with_cpu_session, is_before_spark_330
+from spark_session import with_cpu_session, is_before_spark_313, is_before_spark_330
 from pyspark.sql.types import *
 from marks import datagen_overrides, allow_non_gpu
 import pyspark.sql.functions as f
@@ -335,11 +335,16 @@ def test_in(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark : unary_op_df(spark, data_gen).select(f.col('a').isin(scalars)))
 
+# Avoid NaN in the in_set tests on Spark < 3.1.3, since those versions have issues with NaN
+# comparisons. See https://github.com/NVIDIA/spark-rapids/issues/9687.
+test_inset_data_gen = ([gen for gen in eq_gens_with_decimal_gen if gen != float_gen and gen != double_gen] +
+    [FloatGen(no_nans=True), DoubleGen(no_nans=True)]) \
+    if is_before_spark_313() else eq_gens_with_decimal_gen
+
 # Spark supports two different versions of 'IN', and it depends on the spark.sql.optimizer.inSetConversionThreshold conf
 # This is to test entries over that value.
-@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9687')
-@pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn)
 @allow_non_gpu(*non_utc_allow)
+@pytest.mark.parametrize('data_gen', test_inset_data_gen, ids=idfn)
 def test_in_set(data_gen):
     # nulls are not supported for in on the GPU yet
     num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) + 1
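
For reviewers, a minimal standalone sketch of the `IN`/`InSet` behavior the compatibility note describes. This is illustrative only and not part of the patch: the session setup, column name, and literal list are assumptions, and the CPU result depends on the Spark version in use.

```python
# Illustrative repro only, not part of this patch. Assumes a local PySpark install.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

nan = float("nan")
df = spark.createDataFrame([(nan,)], "a double")

# Use more entries than spark.sql.optimizer.inSetConversionThreshold (default 10)
# so the optimizer rewrites the IN list into an InSet expression.
in_list = [nan] + [float(i) for i in range(10)]

# Spark 3.1.3+ (and the RAPIDS Accelerator) compare NaN as equal here and return
# the row; Spark <= 3.1.2 can treat each NaN as distinct (SPARK-36792) and
# return no rows.
df.filter(df.a.isin(in_list)).show()

spark.stop()
```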