From 778fa00c327dc3562261ef3bb28c84ed28127570 Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Tue, 28 Nov 2023 11:25:34 +0330
Subject: [PATCH] Change xfail to allow_non_gpu

Signed-off-by: Chong Gao
---
 integration_tests/src/main/python/aqe_test.py |  3 +-
 .../src/main/python/cast_test.py              | 40 +++++------
 integration_tests/src/main/python/cmp_test.py |  4 +-
 .../src/main/python/collection_ops_test.py    | 12 ++--
 .../src/main/python/conditionals_test.py      |  6 +-
 integration_tests/src/main/python/csv_test.py | 16 ++---
 integration_tests/src/main/python/data_gen.py |  6 +-
 .../src/main/python/datasourcev2_read_test.py |  3 +-
 .../src/main/python/date_time_test.py         | 70 +++++++++----------
 .../src/main/python/explain_test.py           |  2 +-
 .../src/main/python/generate_expr_test.py     | 20 +++---
 .../src/main/python/hash_aggregate_test.py    | 48 ++++++------
 .../main/python/hive_delimited_text_test.py   | 12 ++--
 .../src/main/python/hive_write_test.py        |  2 +-
 .../src/main/python/join_test.py              | 58 ++++++++--------
 .../src/main/python/json_test.py              | 31 ++++-----
 integration_tests/src/main/python/map_test.py | 32 ++++-----
 .../src/main/python/orc_cast_test.py          |  9 +--
 integration_tests/src/main/python/orc_test.py | 16 ++---
 .../src/main/python/orc_write_test.py         | 15 ++--
 .../src/main/python/parquet_test.py           | 18 ++---
 .../src/main/python/parquet_testing_test.py   |  5 +-
 .../src/main/python/parquet_write_test.py     | 30 ++++----
 .../src/main/python/qa_nightly_select_test.py |  8 +--
 .../src/main/python/repart_test.py            | 10 +--
 .../src/main/python/sort_test.py              | 14 ++--
 .../src/main/python/window_function_test.py   | 41 ++++++-----
 27 files changed, 263 insertions(+), 268 deletions(-)

diff --git a/integration_tests/src/main/python/aqe_test.py b/integration_tests/src/main/python/aqe_test.py
index 189bef329d72..7f2f68c58803 100755
--- a/integration_tests/src/main/python/aqe_test.py
+++ b/integration_tests/src/main/python/aqe_test.py
@@ -194,9 +194,8 @@ def do_it(spark):
 # broadcast join. The bug currently manifests in Databricks, but could
 # theoretically show up in other Spark distributions
 @ignore_order(local=True)
-@allow_non_gpu('BroadcastNestedLoopJoinExec', 'Cast', 'DateSub', *db_113_cpu_bnlj_join_allow)
+@allow_non_gpu('BroadcastNestedLoopJoinExec', 'Cast', 'DateSub', *db_113_cpu_bnlj_join_allow, *non_utc_allow)
 @pytest.mark.parametrize('join', joins, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_aqe_join_reused_exchange_inequality_condition(spark_tmp_path, join):
     data_path = spark_tmp_path + '/PARQUET_DATA'
     def prep(spark):
diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py
index 725b95a6590d..f3614347ac87 100644
--- a/integration_tests/src/main/python/cast_test.py
+++ b/integration_tests/src/main/python/cast_test.py
@@ -61,7 +61,7 @@ def test_cast_nested(data_gen, to_type):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type)))
 
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 @datagen_overrides(seed=0, reason="https://github.com/NVIDIA/spark-rapids/issues/9781")
 def test_cast_string_date_valid_format():
     # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format.
@@ -91,7 +91,7 @@ def test_cast_string_date_valid_format(): # Spark 320+ and databricks support Ansi mode when casting string to date # This means an exception will be thrown when casting invalid string to date on Spark 320+ or databricks # test Spark versions < 3.2.0 and non databricks, ANSI mode -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) @pytest.mark.skipif(not is_before_spark_320(), reason="ansi cast(string as date) throws exception only in 3.2.0+ or db") def test_cast_string_date_invalid_ansi_before_320(): data_rows = [(v,) for v in values_string_to_data] @@ -101,7 +101,7 @@ def test_cast_string_date_invalid_ansi_before_320(): 'spark.sql.ansi.enabled': 'true'}, ) # test Spark versions >= 320 and databricks, ANSI mode, valid values -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) @pytest.mark.skipif(is_before_spark_320(), reason="Spark versions(< 320) not support Ansi mode when casting string to date") def test_cast_string_date_valid_ansi(): data_rows = [(v,) for v in valid_values_string_to_date] @@ -112,7 +112,7 @@ def test_cast_string_date_valid_ansi(): # test Spark versions >= 320, ANSI mode @pytest.mark.skipif(is_before_spark_320(), reason="ansi cast(string as date) throws exception only in 3.2.0+") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) @pytest.mark.parametrize('invalid', invalid_values_string_to_date) def test_cast_string_date_invalid_ansi(invalid): assert_gpu_and_cpu_error( @@ -145,7 +145,7 @@ def test_try_cast_fallback_340(invalid): 'spark.sql.ansi.enabled': True}) # test all Spark versions, non ANSI mode, invalid value will be converted to NULL -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_string_date_non_ansi(): data_rows = [(v,) for v in values_string_to_data] assert_gpu_and_cpu_are_equal_collect( @@ -157,7 +157,7 @@ def test_cast_string_date_non_ansi(): StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9].[0-9]{0,6}Z?')], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_string_ts_valid_format(data_gen): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. 
@@ -305,7 +305,7 @@ def _assert_cast_to_string_equal (data_gen, conf): @pytest.mark.parametrize('data_gen', all_array_gens_for_cast_to_string, ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_array_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, @@ -325,7 +325,7 @@ def test_cast_array_with_unmatched_element_to_string(data_gen, legacy): @pytest.mark.parametrize('data_gen', basic_map_gens_for_cast_to_string, ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_map_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, @@ -345,7 +345,7 @@ def test_cast_map_with_unmatched_element_to_string(data_gen, legacy): @pytest.mark.parametrize('data_gen', [StructGen([[str(i), gen] for i, gen in enumerate(basic_array_struct_gens_for_cast_to_string)] + [["map", MapGen(ByteGen(nullable=False), null_gen)]])], ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_struct_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, @@ -410,7 +410,7 @@ def test_cast_string_to_negative_scale_decimal(): @pytest.mark.skipif(is_before_spark_330(), reason="ansi cast throws exception only in 3.3.0+") @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) @pytest.mark.parametrize('invalid_value', [float("inf"), float("-inf"), float("nan")]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_float_to_timestamp_ansi_for_nan_inf(type, invalid_value): def fun(spark): data = [invalid_value] @@ -422,7 +422,7 @@ def fun(spark): @pytest.mark.skipif(is_before_spark_330(), reason="ansi cast throws exception only in 3.3.0+") @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) @pytest.mark.parametrize('invalid_value', [float(LONG_MAX) + 100, float(LONG_MIN) - 100]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_float_to_timestamp_ansi_overflow(type, invalid_value): def fun(spark): data = [invalid_value] @@ -431,7 +431,7 @@ def fun(spark): assert_gpu_and_cpu_error(fun, {"spark.sql.ansi.enabled": True}, "ArithmeticException") @pytest.mark.skipif(is_before_spark_330(), reason='330+ throws exception in ANSI mode') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_float_to_timestamp_side_effect(): def getDf(spark): data = [(True, float(LONG_MAX) + 100), (False, float(1))] @@ -443,7 +443,7 @@ def getDf(spark): # non ansi mode, will get null @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_float_to_timestamp_for_nan_inf(type): def fun(spark): data = [(float("inf"),), (float("-inf"),), (float("nan"),)] @@ -463,7 +463,7 @@ def fun(spark): short_gen, int_gen, long_gen_to_timestamp], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_integral_to_timestamp(gen, ansi_enabled): if(is_before_spark_330() and ansi_enabled): # 330- does not support in ANSI mode pytest.skip() @@ -472,7 +472,7 @@ def test_cast_integral_to_timestamp(gen, ansi_enabled): conf={"spark.sql.ansi.enabled": ansi_enabled}) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_float_to_timestamp(ansi_enabled): if(is_before_spark_330() and ansi_enabled): # 330- does not support in ANSI mode pytest.skip() @@ -482,7 +482,7 @@ def test_cast_float_to_timestamp(ansi_enabled): conf={"spark.sql.ansi.enabled": ansi_enabled}) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_double_to_timestamp(ansi_enabled): if (is_before_spark_330() and ansi_enabled): # 330- does not support in ANSI mode pytest.skip() @@ -500,7 +500,7 @@ def test_cast_double_to_timestamp(ansi_enabled): (INT_MIN - 1, IntegerType()), ], ids=idfn) @pytest.mark.skipif(is_before_spark_330(), reason="Spark 330- does not ansi casting between numeric and timestamp") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_timestamp_to_integral_ansi_overflow(invalid_and_type): (invalid, to_type) = invalid_and_type assert_gpu_and_cpu_error( @@ -511,7 +511,7 @@ def test_cast_timestamp_to_integral_ansi_overflow(invalid_and_type): error_message="overflow") @pytest.mark.skipif(is_before_spark_330(), reason="Spark 330- does not ansi casting between numeric and timestamp") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_timestamp_to_numeric_ansi_no_overflow(): data = [datetime.fromtimestamp(i) for i in range(BYTE_MIN, BYTE_MAX + 1)] assert_gpu_and_cpu_are_equal_collect( @@ -520,14 +520,14 @@ def test_cast_timestamp_to_numeric_ansi_no_overflow(): "cast(value as float)", "cast(value as double)"), conf=ansi_enabled_conf) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_timestamp_to_numeric_non_ansi(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, timestamp_gen) .selectExpr("cast(a as byte)", "cast(a as short)", "cast(a as int)", "cast(a as long)", "cast(a as float)", "cast(a as double)")) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_timestamp_to_string(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, timestamp_gen) diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py index b451f9c3db78..8a92c2681822 100644 --- a/integration_tests/src/main/python/cmp_test.py +++ b/integration_tests/src/main/python/cmp_test.py @@ -19,7 +19,7 @@ from data_gen import * from spark_session import with_cpu_session, is_before_spark_330 from pyspark.sql.types import * -from marks import datagen_overrides +from marks import datagen_overrides, allow_non_gpu import pyspark.sql.functions as f @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn) @@ -336,7 +336,7 @@ def test_in(data_gen): # This is to test entries over that value. @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9687') @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_in_set(data_gen): # nulls are not supported for in on the GPU yet num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) + 1 diff --git a/integration_tests/src/main/python/collection_ops_test.py b/integration_tests/src/main/python/collection_ops_test.py index 539282708f37..87fde6f9ac10 100644 --- a/integration_tests/src/main/python/collection_ops_test.py +++ b/integration_tests/src/main/python/collection_ops_test.py @@ -23,6 +23,8 @@ import pyspark.sql.utils from spark_session import with_cpu_session, with_gpu_session from conftest import get_datagen_seed +from marks import allow_non_gpu + nested_gens = [ArrayGen(LongGen()), ArrayGen(decimal_gen_128bit), StructGen([("a", LongGen()), ("b", decimal_gen_128bit)]), @@ -251,7 +253,7 @@ def test_sort_array_normalize_nans(): gens in sequence_normal_integral_gens] @pytest.mark.parametrize('start_gen,stop_gen', sequence_normal_no_step_integral_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sequence_without_step(start_gen, stop_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, start_gen, stop_gen).selectExpr( @@ -260,7 +262,7 @@ def test_sequence_without_step(start_gen, stop_gen): "sequence(20, b)")) @pytest.mark.parametrize('start_gen,stop_gen,step_gen', sequence_normal_integral_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sequence_with_step(start_gen, stop_gen, step_gen): # Get the datagen seed we use for all datagens, since we need to call start # on step_gen @@ -309,7 +311,7 @@ def test_sequence_with_step(start_gen, stop_gen, step_gen): ] @pytest.mark.parametrize('start_gen,stop_gen,step_gen', sequence_illegal_boundaries_integral_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): 
assert_gpu_and_cpu_error( lambda spark:three_col_df(spark, start_gen, stop_gen, step_gen).selectExpr( @@ -324,7 +326,7 @@ def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): ] @pytest.mark.parametrize('stop_gen', sequence_too_long_length_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sequence_too_long_sequence(stop_gen): assert_gpu_and_cpu_error( # To avoid OOM, reduce the row number to 1, it is enough to verify this case. @@ -366,7 +368,7 @@ def get_sequence_data(gen, len): mixed_schema) # test for 3 cases mixed in a single dataset -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sequence_with_step_mixed_cases(): assert_gpu_and_cpu_are_equal_collect( lambda spark: get_sequence_cases_mixed_df(spark) diff --git a/integration_tests/src/main/python/conditionals_test.py b/integration_tests/src/main/python/conditionals_test.py index 1bc11c5d27b0..b418483fb10c 100644 --- a/integration_tests/src/main/python/conditionals_test.py +++ b/integration_tests/src/main/python/conditionals_test.py @@ -19,7 +19,7 @@ from data_gen import * from spark_session import is_before_spark_320, is_jvm_charset_utf8 from pyspark.sql.types import * -from marks import datagen_overrides +from marks import datagen_overrides, allow_non_gpu import pyspark.sql.functions as f def mk_str_gen(pattern): @@ -233,7 +233,7 @@ def test_conditional_with_side_effects_case_when(data_gen): conf = test_conf) @pytest.mark.parametrize('data_gen', [mk_str_gen('[a-z]{0,3}')], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_conditional_with_side_effects_sequence(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( @@ -244,7 +244,7 @@ def test_conditional_with_side_effects_sequence(data_gen): @pytest.mark.skipif(is_before_spark_320(), reason='Earlier versions of Spark cannot cast sequence to string') @pytest.mark.parametrize('data_gen', [mk_str_gen('[a-z]{0,3}')], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_conditional_with_side_effects_sequence_cast(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py index c10221a44072..c7330a06c331 100644 --- a/integration_tests/src/main/python/csv_test.py +++ b/integration_tests/src/main/python/csv_test.py @@ -248,7 +248,7 @@ def read_impl(spark): @pytest.mark.parametrize('read_func', [read_csv_df, read_csv_sql]) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) @pytest.mark.parametrize('ansi_enabled', ["true", "false"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_basic_csv_read(std_input_path, name, schema, options, read_func, v1_enabled_list, ansi_enabled, spark_tmp_table_factory): 
updated_conf=copy_and_update(_enable_all_types_conf, { 'spark.sql.sources.useV1SourceList': v1_enabled_list, @@ -289,7 +289,7 @@ def test_csv_read_small_floats(std_input_path, name, schema, options, read_func, @approximate_float @pytest.mark.parametrize('data_gen', csv_supported_gens, ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_round_trip(spark_tmp_path, data_gen, v1_enabled_list): gen = StructGen([('a', data_gen)], nullable=False) data_path = spark_tmp_path + '/CSV_DATA' @@ -406,7 +406,7 @@ def test_read_valid_and_invalid_dates(std_input_path, filename, v1_enabled_list, @pytest.mark.parametrize('ts_part', csv_supported_ts_parts) @pytest.mark.parametrize('date_format', csv_supported_date_formats) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_enabled_list): full_format = date_format + ts_part data_gen = TimestampGen() @@ -476,8 +476,7 @@ def test_input_meta_fallback(spark_tmp_path, v1_enabled_list, disable_conf): cpu_fallback_class_name = 'FileSourceScanExec' if v1_enabled_list == 'csv' else 'BatchScanExec', conf=updated_conf) -@allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec', *non_utc_allow) def test_csv_save_as_table_fallback(spark_tmp_path, spark_tmp_table_factory): gen = TimestampGen() data_path = spark_tmp_path + '/CSV_DATA' @@ -563,14 +562,13 @@ def test_csv_read_count(spark_tmp_path): assert_gpu_and_cpu_row_counts_equal(lambda spark: spark.read.csv(data_path), conf = {'spark.rapids.sql.explain': 'ALL'}) -@allow_non_gpu('FileSourceScanExec', 'ProjectExec', 'CollectLimitExec', 'DeserializeToObjectExec') +@allow_non_gpu('FileSourceScanExec', 'ProjectExec', 'CollectLimitExec', 'DeserializeToObjectExec', *non_utc_allow) @pytest.mark.skipif(is_before_spark_341(), reason='`TIMESTAMP_NTZ` is only supported in PySpark 341+') @pytest.mark.parametrize('date_format', csv_supported_date_formats) @pytest.mark.parametrize('ts_part', csv_supported_ts_parts) @pytest.mark.parametrize("timestamp_type", [ pytest.param('TIMESTAMP_LTZ', marks=pytest.mark.xfail(is_spark_350_or_later(), reason="https://github.com/NVIDIA/spark-rapids/issues/9325")), "TIMESTAMP_NTZ"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_csv_infer_schema_timestamp_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type): csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'csv', 'FileSourceScanExec') @@ -621,9 +619,9 @@ def do_read(spark): non_exist_classes = cpu_scan_class, conf = conf) -@allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec') +@allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec', *non_utc_allow) @pytest.mark.skipif(is_before_spark_340(), reason='`preferDate` is only supported in Spark 340+') 
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+
 def test_csv_prefer_date_with_infer_schema(spark_tmp_path):
     # start date ""0001-01-02" required due to: https://github.com/NVIDIA/spark-rapids/issues/5606
     data_gens = [byte_gen, short_gen, int_gen, long_gen, boolean_gen, timestamp_gen, DateGen(start=date(1, 1, 2))]
diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py
index 9aa5e547c45d..4a4a835610f3 100644
--- a/integration_tests/src/main/python/data_gen.py
+++ b/integration_tests/src/main/python/data_gen.py
@@ -24,7 +24,7 @@ from spark_session import is_before_spark_340, with_cpu_session
 import sre_yield
 import struct
-from conftest import skip_unless_precommit_tests,get_datagen_seed
+from conftest import skip_unless_precommit_tests, get_datagen_seed, is_not_utc
 import time
 import os
 from functools import lru_cache
@@ -1172,3 +1172,7 @@ def get_25_partitions_df(spark):
                            StructField("c3", IntegerType())])
     data = [[i, j, k] for i in range(0, 5) for j in range(0, 5) for k in range(0, 100)]
     return spark.createDataFrame(data, schema)
+
+
+# Allow these execs to fall back to the CPU when the time zone is non-UTC, because of https://github.com/NVIDIA/spark-rapids/issues/9653
+non_utc_allow = ['ProjectExec', 'FilterExec', 'FileSourceScanExec', 'BatchScanExec', 'CollectLimitExec', 'DeserializeToObjectExec', 'DataWritingCommandExec', 'WriteFilesExec'] if is_not_utc() else []
diff --git a/integration_tests/src/main/python/datasourcev2_read_test.py b/integration_tests/src/main/python/datasourcev2_read_test.py
index 4a25d618e7d6..b2134c50618d 100644
--- a/integration_tests/src/main/python/datasourcev2_read_test.py
+++ b/integration_tests/src/main/python/datasourcev2_read_test.py
@@ -16,6 +16,7 @@
 from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal
 from conftest import is_not_utc
+from data_gen import *
 from marks import *
 
 columnarClass = 'com.nvidia.spark.rapids.tests.datasourcev2.parquet.ArrowColumnarDataSourceV2'
@@ -31,7 +32,7 @@ def test_read_int():
     assert_gpu_and_cpu_are_equal_collect(readTable("int", columnarClass))
 
 @validate_execs_in_gpu_plan('HostColumnarToGpu')
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_read_strings():
     assert_gpu_and_cpu_are_equal_collect(readTable("string", columnarClass))
 
diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py
index 8ce8d5c5e00a..d8047aa55694 100644
--- a/integration_tests/src/main/python/date_time_test.py
+++ b/integration_tests/src/main/python/date_time_test.py
@@ -27,7 +27,7 @@
                (1885, -2828), (0, 2463), (932, 2286), (0, 0)]
 
 @pytest.mark.parametrize('data_gen', vals, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_timesub(data_gen):
     days, seconds = data_gen
     assert_gpu_and_cpu_are_equal_collect(
             .selectExpr("a - (interval {} days {} seconds)".format(days, seconds)))
 
 @pytest.mark.parametrize('data_gen', vals, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow) def test_timeadd(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -46,7 +46,7 @@ def test_timeadd(data_gen): .selectExpr("a + (interval {} days {} seconds)".format(days, seconds))) @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timeadd_daytime_column(): gen_list = [ # timestamp column max year is 1000 @@ -64,7 +64,7 @@ def test_interval_seconds_overflow_exception(): error_message="IllegalArgumentException") @pytest.mark.parametrize('data_gen', vals, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timeadd_from_subquery(data_gen): def fun(spark): @@ -76,7 +76,7 @@ def fun(spark): assert_gpu_and_cpu_are_equal_collect(fun) @pytest.mark.parametrize('data_gen', vals, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timesub_from_subquery(data_gen): def fun(spark): @@ -92,7 +92,7 @@ def fun(spark): # [SPARK-34896][SQL] Return day-time interval from dates subtraction # 1. Add the SQL config `spark.sql.legacy.interval.enabled` which will control when Spark SQL should use `CalendarIntervalType` instead of ANSI intervals. @pytest.mark.parametrize('data_gen', vals, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_dateaddinterval(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -103,7 +103,7 @@ def test_dateaddinterval(data_gen): # test add days(not specify hours, minutes, seconds, milliseconds, microseconds) in ANSI mode. 
@pytest.mark.parametrize('data_gen', vals, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_dateaddinterval_ansi(data_gen): days, _ = data_gen # only specify the `days` @@ -131,17 +131,17 @@ def test_datediff(data_gen): 'datediff(a, date(null))', 'datediff(a, \'2016-03-02\')')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hour(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('hour(a)')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_minute(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('minute(a)')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_second(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('second(a)')) @@ -200,7 +200,7 @@ def test_datesub(data_gen): to_unix_timestamp_days_gen=[ByteGen(), ShortGen(), IntegerGen(min_val=-106032829, max_val=103819094, special_cases=[-106032829, 103819094,0,1,-1])] @pytest.mark.parametrize('data_gen', to_unix_timestamp_days_gen, ids=idfn) @incompat -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_dateadd_with_date_overflow(data_gen): string_type = to_cast_string(data_gen.data_type) assert_gpu_and_cpu_are_equal_collect( @@ -214,7 +214,7 @@ def test_dateadd_with_date_overflow(data_gen): to_unix_timestamp_days_gen=[ByteGen(), ShortGen(), IntegerGen(max_val=106032829, min_val=-103819094, special_cases=[106032829, -103819094,0,1,-1])] @pytest.mark.parametrize('data_gen', to_unix_timestamp_days_gen, ids=idfn) @incompat -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_datesub_with_date_overflow(data_gen): string_type = to_cast_string(data_gen.data_type) assert_gpu_and_cpu_are_equal_collect( @@ -246,7 +246,7 @@ def test_dayofyear(data_gen): lambda spark : unary_op_df(spark, data_gen).select(f.dayofyear(f.col('a')))) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_unix_timestamp(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col('a')))) @@ -263,7 +263,7 @@ def test_unsupported_fallback_unix_timestamp(data_gen): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_to_unix_timestamp(data_gen, 
ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("to_unix_timestamp(a)"), @@ -281,7 +281,7 @@ def test_unsupported_fallback_to_unix_timestamp(data_gen): @pytest.mark.parametrize('time_zone', ["UTC", "UTC+0", "UTC-0", "GMT", "GMT+0", "GMT-0"], ids=idfn) @pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_from_utc_timestamp(data_gen, time_zone): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.from_utc_timestamp(f.col('a'), time_zone))) @@ -297,7 +297,7 @@ def test_from_utc_timestamp_unsupported_timezone_fallback(data_gen, time_zone): @pytest.mark.parametrize('time_zone', ["UTC", "Asia/Shanghai", "EST", "MST", "VST"], ids=idfn) @pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_from_utc_timestamp_supported_timezones(data_gen, time_zone): # Remove spark.rapids.test.CPU.timezone configuration when GPU kernel is ready to really test on GPU assert_gpu_and_cpu_are_equal_collect( @@ -355,7 +355,7 @@ def fun(spark): @pytest.mark.parametrize('parser_policy', ["CORRECTED", "EXCEPTION"], ids=idfn) # first get expected string via `date_format` -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_string_to_timestamp_functions_ansi_valid(parser_policy): expr_format = "{operator}(date_format(a, '{fmt}'), '{fmt}')" formats = ['yyyy-MM-dd', 'yyyy/MM/dd', 'yyyy-MM', 'yyyy/MM', 'dd/MM/yyyy', 'yyyy-MM-dd HH:mm:ss', @@ -373,7 +373,7 @@ def fun(spark): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_unix_timestamp_improved(data_gen, ansi_enabled): conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true", "spark.sql.legacy.timeParserPolicy": "CORRECTED"} @@ -383,7 +383,7 @@ def test_unix_timestamp_improved(data_gen, ansi_enabled): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_unix_timestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col("a"))), @@ -391,7 +391,7 @@ def test_unix_timestamp(data_gen, ansi_enabled): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_to_unix_timestamp_improved(data_gen, ansi_enabled): conf = 
{"spark.rapids.sql.improvedTimeOps.enabled": "true"} assert_gpu_and_cpu_are_equal_collect( @@ -410,7 +410,7 @@ def invalid_date_string_df(spark): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_string_to_unix_timestamp(data_gen, date_form, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen, seed=1).selectExpr("to_unix_timestamp(a, '{}')".format(date_form)), @@ -424,7 +424,7 @@ def test_string_to_unix_timestamp_ansi_exception(): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_string_unix_timestamp(data_gen, date_form, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen, seed=1).select(f.unix_timestamp(f.col('a'), date_form)), @@ -438,7 +438,7 @@ def test_string_unix_timestamp_ansi_exception(): @pytest.mark.parametrize('data_gen', [StringGen('200[0-9]-0[1-9]-[0-2][1-8]')], ids=idfn) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_gettimestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "yyyy-MM-dd")), @@ -446,7 +446,7 @@ def test_gettimestamp(data_gen, ansi_enabled): @pytest.mark.parametrize('data_gen', [StringGen('0[1-9]200[0-9]')], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_gettimestamp_format_MMyyyy(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "MMyyyy"))) @@ -462,7 +462,7 @@ def test_gettimestamp_ansi_exception(): @pytest.mark.parametrize('date_format', supported_date_formats, ids=idfn) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_date_format(data_gen, date_format): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("date_format(a, '{}')".format(date_format))) @@ -497,7 +497,7 @@ def test_date_format_maybe(data_gen, date_format): @pytest.mark.parametrize('date_format', maybe_supported_date_formats, ids=idfn) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_date_format_maybe_incompat(data_gen, date_format): conf = {"spark.rapids.sql.incompatibleDateFormats.enabled": "true"} 
    assert_gpu_and_cpu_are_equal_collect(
@@ -509,7 +509,7 @@ def test_date_format_maybe_incompat(data_gen, date_format):
 # input_file_name(), otherwise filter happens before project.
-@allow_non_gpu('CollectLimitExec,FileSourceScanExec,DeserializeToObjectExec')
+@allow_non_gpu('CollectLimitExec,FileSourceScanExec,DeserializeToObjectExec', *non_utc_allow)
 @ignore_order()
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+
 def test_date_format_mmyyyy_cast_canonicalization(spark_tmp_path):
     data_path = spark_tmp_path + '/CSV_DATA'
     gen = StringGen(pattern='[0][0-9][1][8-9][1-9][1-9]', nullable=False)
@@ -555,12 +555,12 @@ def test_unsupported_fallback_to_date():
 seconds_gens = [LongGen(min_val=-62135510400, max_val=253402214400), IntegerGen(), ShortGen(), ByteGen(), DoubleGen(min_exp=0, max_exp=32),
                 ts_float_gen, DecimalGen(16, 6), DecimalGen(13, 3), DecimalGen(10, 0), DecimalGen(7, -3), DecimalGen(6, 6)]
 @pytest.mark.parametrize('data_gen', seconds_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_timestamp_seconds(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)"))
 
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_timestamp_seconds_long_overflow():
     assert_gpu_and_cpu_error(
         lambda spark : unary_op_df(spark, long_gen).selectExpr("timestamp_seconds(a)").collect(),
@@ -568,7 +568,7 @@ def test_timestamp_seconds_long_overflow():
     error_message='long overflow')
 
 @pytest.mark.parametrize('data_gen', [DecimalGen(7, 7), DecimalGen(20, 7)], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_timestamp_seconds_rounding_necessary(data_gen):
     assert_gpu_and_cpu_error(
         lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)").collect(),
@@ -576,7 +576,7 @@ def test_timestamp_seconds_rounding_necessary(data_gen):
     error_message='Rounding necessary')
 
 @pytest.mark.parametrize('data_gen', [DecimalGen(19, 6), DecimalGen(20, 6)], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_timestamp_seconds_decimal_overflow(data_gen):
     assert_gpu_and_cpu_error(
         lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)").collect(),
@@ -585,12 +585,12 @@
 millis_gens = [LongGen(min_val=-62135510400000, max_val=253402214400000), IntegerGen(), ShortGen(), ByteGen()]
 @pytest.mark.parametrize('data_gen', millis_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_timestamp_millis(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_millis(a)"))
 
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def
test_timestamp_millis_long_overflow(): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, long_gen).selectExpr("timestamp_millis(a)").collect(), @@ -599,7 +599,7 @@ def test_timestamp_millis_long_overflow(): micros_gens = [LongGen(min_val=-62135510400000000, max_val=253402214400000000), IntegerGen(), ShortGen(), ByteGen()] @pytest.mark.parametrize('data_gen', micros_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timestamp_micros(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_micros(a)")) diff --git a/integration_tests/src/main/python/explain_test.py b/integration_tests/src/main/python/explain_test.py index 1837f31aa950..d182c0938501 100644 --- a/integration_tests/src/main/python/explain_test.py +++ b/integration_tests/src/main/python/explain_test.py @@ -50,7 +50,7 @@ def do_join_explain(spark): with_cpu_session(do_join_explain) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_explain_set_config(): conf = {'spark.rapids.sql.hasExtendedYearValues': 'false', 'spark.rapids.sql.castStringToTimestamp.enabled': 'true'} diff --git a/integration_tests/src/main/python/generate_expr_test.py b/integration_tests/src/main/python/generate_expr_test.py index acc3e125ee6c..7e8c5ced3995 100644 --- a/integration_tests/src/main/python/generate_expr_test.py +++ b/integration_tests/src/main/python/generate_expr_test.py @@ -115,7 +115,7 @@ def test_explode_outer_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_explode_outer_nested_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -127,7 +127,7 @@ def test_explode_outer_nested_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_makearray(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : four_op_df(spark, data_gen).selectExpr('posexplode(array(b, c, d))', 'a')) @@ -136,7 +136,7 @@ def test_posexplode_makearray(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_litarray(data_gen): array_lit = with_cpu_session( lambda spark: gen_scalar(ArrayGen(data_gen, min_length=3, max_length=3, nullable=False))) @@ -151,7 +151,7 @@ def test_posexplode_litarray(data_gen): @pytest.mark.parametrize('data_gen', explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + arrays_with_binary + map_gens_sample + 
maps_with_binary, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -162,7 +162,7 @@ def test_posexplode_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -173,7 +173,7 @@ def test_posexplode_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_nested_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -188,7 +188,7 @@ def test_posexplode_nested_array_data(data_gen): @pytest.mark.parametrize('data_gen', explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + arrays_with_binary + map_gens_sample + maps_with_binary, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_outer_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -199,7 +199,7 @@ def test_posexplode_outer_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_outer_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -210,7 +210,7 @@ def test_posexplode_outer_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_nested_outer_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -235,7 +235,7 @@ def test_stack(): # gpu stack not guarantee to produce the same output order as Spark does @ignore_order(local=True) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_stack_mixed_types(): base_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, date_gen, timestamp_gen, null_gen, DecimalGen(precision=7, scale=3), diff 
--git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py
index 03422e3f4bc3..a6f1bb017fe1 100644
--- a/integration_tests/src/main/python/hash_aggregate_test.py
+++ b/integration_tests/src/main/python/hash_aggregate_test.py
@@ -631,7 +631,7 @@ def test_min_max_group_by(data_gen):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', _gen_data_for_collect_list_op, ids=idfn)
 @pytest.mark.parametrize('use_obj_hash_agg', [True, False], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_hash_groupby_collect_list(data_gen, use_obj_hash_agg):
     def doit(spark):
         df = gen_df(spark, data_gen, length=100)\
@@ -663,7 +663,7 @@ def doit(spark):
 
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_hash_groupby_collect_set(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: gen_df(spark, data_gen, length=100)
@@ -672,7 +672,7 @@ def test_hash_groupby_collect_set(data_gen):
 
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_hash_groupby_collect_set_on_nested_type(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: gen_df(spark, data_gen, length=100)
@@ -687,7 +687,7 @@ def test_hash_groupby_collect_set_on_nested_type(data_gen):
 @ignore_order(local=True)
-@allow_non_gpu("ProjectExec", "SortArray")
+@allow_non_gpu("ProjectExec", "SortArray", *non_utc_allow)
 @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op_nested, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+
 def test_hash_groupby_collect_set_on_nested_array_type(data_gen):
     conf = copy_and_update(_float_conf, {
         "spark.rapids.sql.castFloatToString.enabled": "true",
@@ -709,7 +709,7 @@ def do_it(spark):
 
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_hash_reduction_collect_set(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: gen_df(spark, data_gen, length=100)
@@ -717,7 +717,7 @@ def test_hash_reduction_collect_set(data_gen):
 
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_hash_reduction_collect_set_on_nested_type(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: gen_df(spark, data_gen, length=100)
@@ -731,7 +731,7 @@ def test_hash_reduction_collect_set_on_nested_type(data_gen):
 @ignore_order(local=True)
-@allow_non_gpu("ProjectExec", "SortArray")
+@allow_non_gpu("ProjectExec", "SortArray", *non_utc_allow)
 @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op_nested, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+
 def test_hash_reduction_collect_set_on_nested_array_type(data_gen):
     conf = copy_and_update(_float_conf, {
         "spark.rapids.sql.castFloatToString.enabled": "true",
@@ -751,7 +751,7 @@ def do_it(spark):
 
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_hash_groupby_collect_with_single_distinct(data_gen):
     # test collect_ops with other distinct aggregations
     assert_gpu_and_cpu_are_equal_collect(
@@ -764,7 +764,7 @@
 
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', _gen_data_for_collect_op, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_hash_groupby_single_distinct_collect(data_gen):
     # test distinct collect
     sql = """select a,
@@ -788,7 +788,7 @@ def test_hash_groupby_single_distinct_collect(data_gen):
 
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', _gen_data_for_collect_op, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_hash_groupby_collect_with_multi_distinct(data_gen):
     def spark_fn(spark_session):
         return gen_df(spark_session, data_gen, length=100).groupby('a').agg(
@@ -815,7 +815,7 @@ def spark_fn(spark_session):
 @pytest.mark.parametrize('replace_mode', _replace_modes_non_distinct, ids=idfn)
 @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn)
 @pytest.mark.parametrize('use_obj_hash_agg', ['false', 'true'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_hash_groupby_collect_partial_replace_fallback(data_gen,
                                                        replace_mode,
                                                        aqe_enabled,
@@ -863,7 +863,7 @@ def test_hash_groupby_collect_partial_replace_fallback(data_gen,
 @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn)
 @pytest.mark.parametrize('use_obj_hash_agg', ['false', 'true'], ids=idfn)
 @pytest.mark.xfail(condition=is_databricks104_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/4963')
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_hash_groupby_collect_partial_replace_with_distinct_fallback(data_gen,
                                                                      replace_mode,
                                                                      aqe_enabled,
@@ -1262,7 +1262,7 @@ def test_first_last_reductions_decimal_types(data_gen):
             'first(a)', 'last(a)', 'first(a, true)', 'last(a, true)'))
 
 @pytest.mark.parametrize('data_gen', _nested_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_first_last_reductions_nested_types(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         # Coalesce and sort are to make sure that first and last, which are non-deterministic
@@ -1271,7
+1271,7 @@ def test_first_last_reductions_nested_types(data_gen): 'first(a)', 'last(a)', 'first(a, true)', 'last(a, true)')) @pytest.mark.parametrize('data_gen', _all_basic_gens_with_all_nans_cases, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_generic_reductions(data_gen): local_conf = copy_and_update(_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) assert_gpu_and_cpu_are_equal_collect( @@ -1289,7 +1289,7 @@ def test_generic_reductions(data_gen): conf=local_conf) @pytest.mark.parametrize('data_gen', all_gen + _nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_count(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen) \ @@ -1301,7 +1301,7 @@ def test_count(data_gen): conf = {'spark.sql.legacy.allowParameterlessCount': 'true'}) @pytest.mark.parametrize('data_gen', all_basic_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_distinct_count_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).selectExpr( @@ -1325,7 +1325,7 @@ def test_arithmetic_reductions(data_gen): @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + _nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_collect_list_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( # coalescing because collect_list is not deterministic @@ -1344,7 +1344,7 @@ def test_collect_list_reductions(data_gen): @pytest.mark.parametrize('data_gen', _no_neg_zero_all_basic_gens + decimal_gens + _struct_only_nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_collect_set_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('sort_array(collect_set(a))'), @@ -1358,7 +1358,7 @@ def test_collect_empty(): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen + _nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_groupby_first_last(data_gen): gen_fn = [('a', RepeatSeqGen(LongGen(), length=20)), ('b', data_gen)] agg_fn = lambda df: df.groupBy('a').agg( @@ -1372,7 +1372,7 @@ def test_groupby_first_last(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen + _struct_only_nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sorted_groupby_first_last(data_gen): gen_fn = [('a', RepeatSeqGen(LongGen(), length=20)), ('b', data_gen)] # sort by more than the group by columns to be sure that first/last don't remove the ordering @@ 
-1390,7 +1390,7 @@ def test_sorted_groupby_first_last(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('count_func', [f.count, f.countDistinct]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_agg_count(data_gen, count_func): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, [('a', data_gen), ('b', data_gen)], @@ -2047,7 +2047,7 @@ def test_std_variance_partial_replace_fallback(data_gen, null_gen] + array_gens_sample + struct_gens_sample @ignore_order(local=True) @pytest.mark.parametrize('data_gen', gens_for_max_min, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_min_max_in_groupby_and_reduction(data_gen): df_gen = [('a', data_gen), ('b', RepeatSeqGen(IntegerGen(), length=20))] diff --git a/integration_tests/src/main/python/hive_delimited_text_test.py b/integration_tests/src/main/python/hive_delimited_text_test.py index 4d07a077ec09..78c77391c31b 100644 --- a/integration_tests/src/main/python/hive_delimited_text_test.py +++ b/integration_tests/src/main/python/hive_delimited_text_test.py @@ -187,7 +187,7 @@ def read_impl(spark): ('hive-delim-text/carriage-return', StructType([StructField("str", StringType())]), {}), ('hive-delim-text/carriage-return-err', StructType([StructField("str", StringType())]), {}), ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_basic_hive_text_read(std_input_path, name, schema, spark_tmp_table_factory, options): assert_gpu_and_cpu_are_equal_collect(read_hive_text_sql(std_input_path + '/' + name, schema, spark_tmp_table_factory, options), @@ -240,7 +240,7 @@ def read_hive_text_table(spark, text_table_name, fields="my_field"): "https://github.com/NVIDIA/spark-rapids/pull/7628") @approximate_float @pytest.mark.parametrize('data_gen', hive_text_supported_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hive_text_round_trip(spark_tmp_path, data_gen, spark_tmp_table_factory): gen = StructGen([('my_field', data_gen)], nullable=False) data_path = spark_tmp_path + '/hive_text_table' @@ -282,9 +282,8 @@ def read_hive_text_table_partitions(spark, text_table_name, partition): reason="Hive text reads are disabled on CDH, as per " "https://github.com/NVIDIA/spark-rapids/pull/7628") @approximate_float -@allow_non_gpu("EqualTo,IsNotNull,Literal") # Accounts for partition predicate: `WHERE dt='1'` +@allow_non_gpu("EqualTo,IsNotNull,Literal", *non_utc_allow) # Accounts for partition predicate: `WHERE dt='1'` @pytest.mark.parametrize('data_gen', hive_text_supported_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_text_round_trip_partitioned(spark_tmp_path, data_gen, spark_tmp_table_factory): gen = StructGen([('my_field', data_gen)], nullable=False) data_path = spark_tmp_path + '/hive_text_table' @@ -301,9 +300,8 @@ def 
test_hive_text_round_trip_partitioned(spark_tmp_path, data_gen, spark_tmp_ta reason="Hive text reads are disabled on CDH, as per " "https://github.com/NVIDIA/spark-rapids/pull/7628") @approximate_float -@allow_non_gpu("EqualTo,IsNotNull,Literal,Or") # Accounts for partition predicate +@allow_non_gpu("EqualTo,IsNotNull,Literal,Or", *non_utc_allow) # Accounts for partition predicate @pytest.mark.parametrize('data_gen', hive_text_supported_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_text_round_trip_two_partitions(spark_tmp_path, data_gen, spark_tmp_table_factory): """ Added to reproduce: https://github.com/NVIDIA/spark-rapids/issues/7383 @@ -529,7 +527,7 @@ def create_table_with_compressed_files(spark): ('hive-delim-text/carriage-return', StructType([StructField("str", StringType())]), {}), ('hive-delim-text/carriage-return-err', StructType([StructField("str", StringType())]), {}), ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_basic_hive_text_write(std_input_path, input_dir, schema, spark_tmp_table_factory, mode, options): # Configure table options, including schema. if options is None: diff --git a/integration_tests/src/main/python/hive_write_test.py b/integration_tests/src/main/python/hive_write_test.py index 7bc5ceede855..ae7052dffd73 100644 --- a/integration_tests/src/main/python/hive_write_test.py +++ b/integration_tests/src/main/python/hive_write_test.py @@ -59,7 +59,7 @@ def _restricted_timestamp(nullable=True): @pytest.mark.skipif(not is_hive_available(), reason="Hive is missing") @pytest.mark.parametrize("gens", _write_gens, ids=idfn) @pytest.mark.parametrize("storage", ["PARQUET", "nativeorc", "hiveorc"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_optimized_hive_ctas_basic(gens, storage, spark_tmp_table_factory): data_table = spark_tmp_table_factory.get() gen_list = [('c' + str(i), gen) for i, gen in enumerate(gens)] diff --git a/integration_tests/src/main/python/join_test.py b/integration_tests/src/main/python/join_test.py index 9ea78a854011..6660e663c929 100644 --- a/integration_tests/src/main/python/join_test.py +++ b/integration_tests/src/main/python/join_test.py @@ -189,11 +189,11 @@ def do_join(spark): # For floating point values the normalization is done using a higher order function. 
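A note on the pattern applied throughout this patch: `non_utc_allow` is understood to be a list of exec names that may legitimately fall back to the CPU when the session time zone is not UTC, so `@allow_non_gpu(*non_utc_allow)` keeps the CPU/GPU result comparison running and merely tolerates those fallbacks, instead of xfailing the whole test. A minimal sketch of the shape of that helper, assuming names that live in data_gen.py/conftest.py (the exec names below are illustrative, not the real list):

    import time

    def is_not_utc():
        # Stand-in for the real conftest helper, which inspects the configured
        # spark.sql.session.timeZone rather than the OS time zone.
        return time.strftime('%Z') not in ('UTC', 'GMT')

    # Execs permitted to fall back to CPU off-UTC; empty under UTC so the
    # decorator degrades to a no-op. Placeholder names only.
    non_utc_allow = ['ProjectExec', 'FilterExec'] if is_not_utc() else []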
diff --git a/integration_tests/src/main/python/join_test.py b/integration_tests/src/main/python/join_test.py
index 9ea78a854011..6660e663c929 100644
--- a/integration_tests/src/main/python/join_test.py
+++ b/integration_tests/src/main/python/join_test.py
@@ -189,11 +189,11 @@ def do_join(spark):
 # For floating point values the normalization is done using a higher order function. We could probably work around this
 # for now it falls back to the CPU
 @allow_non_gpu('SortMergeJoinExec', 'SortExec', 'ArrayTransform', 'LambdaFunction',
-        'NamedLambdaVariable', 'NormalizeNaNAndZero', 'ShuffleExchangeExec', 'HashPartitioning')
+        'NamedLambdaVariable', 'NormalizeNaNAndZero', 'ShuffleExchangeExec', 'HashPartitioning',
+        *non_utc_allow)
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', single_level_array_gens + [binary_gen], ids=idfn)
 @pytest.mark.parametrize('join_type', all_join_types, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_sortmerge_join_wrong_key_fallback(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 500, 500)
@@ -213,7 +213,7 @@ def do_join(spark):
 @pytest.mark.parametrize('data_gen', basic_nested_gens + [decimal_gen_128bit], ids=idfn)
 @pytest.mark.parametrize('join_type', all_join_types, ids=idfn)
 @pytest.mark.parametrize('sub_part_enabled', ['false', 'true'], ids=['SubPartition_OFF', 'SubPartition_ON'])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_hash_join_ridealong(data_gen, join_type, sub_part_enabled):
     def do_join(spark):
         left, right = create_ridealong_df(spark, short_gen, data_gen, 50, 500)
@@ -230,7 +230,7 @@ def do_join(spark):
 # Not all join types can be translated to a broadcast join, but this tests them to be sure we
 # can handle what spark is doing
 @pytest.mark.parametrize('join_type', all_join_types, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_broadcast_join_right_table(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 500, 250)
@@ -242,7 +242,7 @@ def do_join(spark):
 # Not all join types can be translated to a broadcast join, but this tests them to be sure we
 # can handle what spark is doing
 @pytest.mark.parametrize('join_type', all_join_types, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_broadcast_join_right_table_ridealong(data_gen, join_type):
     def do_join(spark):
         left, right = create_ridealong_df(spark, short_gen, data_gen, 500, 500)
@@ -256,7 +256,7 @@ def do_join(spark):
 # Not all join types can be translated to a broadcast join, but this tests them to be sure we
 # can handle what spark is doing
 @pytest.mark.parametrize('join_type', all_join_types, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_broadcast_join_right_table_with_job_group(data_gen, join_type):
     with_cpu_session(lambda spark : spark.sparkContext.setJobGroup("testjob1", "test", False))
     def do_join(spark):
@@ -271,7 +271,7 @@ def do_join(spark):
 @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params(
     (all_gen + basic_nested_gens, '1g'),
     (join_small_batch_gens + [basic_struct_gen, ArrayGen(string_gen)], '100')), ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_cartesian_join(data_gen, batch_size):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -311,7 +311,7 @@ def do_join(spark):
 @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params(
     (all_gen, '1g'),
     (join_small_batch_gens, '100')), ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_cartesian_join_with_condition(data_gen, batch_size):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -329,7 +329,7 @@ def do_join(spark):
 @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params(
     (all_gen + basic_nested_gens, '1g'),
     (join_small_batch_gens, '100')), ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_broadcast_nested_loop_join(data_gen, batch_size):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -365,7 +365,7 @@ def do_join(spark):
     (join_ast_gen, '1g'),
     ([int_gen], 100)), ids=idfn)
 @pytest.mark.parametrize('join_type', ['Left', 'Inner', 'LeftSemi', 'LeftAnti', 'Cross'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_right_broadcast_nested_loop_join_with_ast_condition(data_gen, join_type, batch_size):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -380,7 +380,7 @@ def do_join(spark):
 # After 3.1.0 is the min spark version we can drop this
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_left_broadcast_nested_loop_join_with_ast_condition(data_gen):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -439,7 +439,7 @@ def do_join(spark):
     float_gen, double_gen,
     string_gen, boolean_gen, date_gen, timestamp_gen], ids=idfn)
 @pytest.mark.parametrize('join_type', ['Left', 'Right', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_broadcast_nested_loop_join_with_array_contains(data_gen, join_type):
     arr_gen = ArrayGen(data_gen)
     literal = with_cpu_session(lambda spark: gen_scalar(data_gen))
@@ -452,7 +452,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', all_gen, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Left', 'LeftSemi', 'LeftAnti'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_right_broadcast_nested_loop_join_condition_missing(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -468,7 +468,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', all_gen, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Right'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_left_broadcast_nested_loop_join_condition_missing(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -483,7 +483,7 @@ def do_join(spark):
 
 @pytest.mark.parametrize('data_gen', all_gen + single_level_array_gens + [binary_gen], ids=idfn)
 @pytest.mark.parametrize('join_type', ['Left', 'LeftSemi', 'LeftAnti'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_right_broadcast_nested_loop_join_condition_missing_count(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -492,29 +492,27 @@ def do_join(spark):
 
 @pytest.mark.parametrize('data_gen', all_gen + single_level_array_gens + [binary_gen], ids=idfn)
 @pytest.mark.parametrize('join_type', ['Right'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_left_broadcast_nested_loop_join_condition_missing_count(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
         return broadcast(left).join(right, how=join_type).selectExpr('COUNT(*)')
     assert_gpu_and_cpu_are_equal_collect(do_join)
 
-@allow_non_gpu('BroadcastExchangeExec', 'BroadcastNestedLoopJoinExec', 'GreaterThanOrEqual')
+@allow_non_gpu('BroadcastExchangeExec', 'BroadcastNestedLoopJoinExec', 'GreaterThanOrEqual', *non_utc_allow)
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', all_gen, ids=idfn)
 @pytest.mark.parametrize('join_type', ['LeftOuter', 'LeftSemi', 'LeftAnti', 'FullOuter'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_broadcast_nested_loop_join_with_conditionals_build_left_fallback(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
         return broadcast(left).join(right, (left.b >= right.r_b), join_type)
     assert_gpu_fallback_collect(do_join, 'BroadcastNestedLoopJoinExec')
 
-@allow_non_gpu('BroadcastExchangeExec', 'BroadcastNestedLoopJoinExec', 'GreaterThanOrEqual')
+@allow_non_gpu('BroadcastExchangeExec', 'BroadcastNestedLoopJoinExec', 'GreaterThanOrEqual', *non_utc_allow)
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', all_gen, ids=idfn)
 @pytest.mark.parametrize('join_type', ['RightOuter', 'FullOuter'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_broadcast_nested_loop_with_conditionals_build_right_fallback(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -531,7 +529,7 @@ def do_join(spark):
 # Specify 200 shuffle partitions to test cases where streaming side is empty
 # as in https://github.com/NVIDIA/spark-rapids/issues/7516
 @pytest.mark.parametrize('shuffle_conf', [{}, {'spark.sql.shuffle.partitions': 200}], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_broadcast_join_left_table(data_gen, join_type, shuffle_conf):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 250, 500)
@@ -543,7 +541,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn)
 @pytest.mark.parametrize('join_type', all_join_types, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_broadcast_join_with_conditionals(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 500, 250)
@@ -598,7 +596,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_sortmerge_join_with_condition_ast(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 500, 250)
@@ -715,7 +713,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_sortmerge_join_struct_as_key(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 500, 250)
@@ -727,7 +725,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_sortmerge_join_struct_mixed_key(data_gen, join_type):
     def do_join(spark):
         left = two_col_df(spark, data_gen, int_gen, length=500)
@@ -740,7 +738,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_sortmerge_join_struct_mixed_key_with_null_filter(data_gen, join_type):
     def do_join(spark):
         left = two_col_df(spark, data_gen, int_gen, length=500)
@@ -755,7 +753,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_broadcast_join_right_struct_as_key(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 500, 250)
@@ -767,7 +765,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_broadcast_join_right_struct_mixed_key(data_gen, join_type):
     def do_join(spark):
         left = two_col_df(spark, data_gen, int_gen, length=500)
@@ -788,11 +786,11 @@ def do_join(spark):
     assert_gpu_and_cpu_are_equal_collect(do_join, conf=_sortmerge_join_conf)
 
 @allow_non_gpu('SortMergeJoinExec', 'SortExec', 'NormalizeNaNAndZero', 'CreateNamedStruct',
-        'GetStructField', 'Literal', 'If', 'IsNull', 'ShuffleExchangeExec', 'HashPartitioning')
+        'GetStructField', 'Literal', 'If', 'IsNull', 'ShuffleExchangeExec', 'HashPartitioning',
+        *non_utc_allow)
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn)
 @pytest.mark.parametrize('join_type', ['FullOuter'], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_sortmerge_join_struct_as_key_fallback(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 500, 500)
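The join changes above all follow the same shape: when a test already whitelists specific execs, `*non_utc_allow` is appended to the existing `@allow_non_gpu` argument list rather than stacked as a second decorator, keeping all allowed operators in one place. A hedged sketch of that composition, with stubs standing in for the integration-test helpers:

    import pytest

    # Stubs standing in for marks.allow_non_gpu and data_gen.non_utc_allow.
    def allow_non_gpu(*operators):
        return pytest.mark.allow_non_gpu(*operators)

    non_utc_allow = []  # empty when the session time zone is UTC

    # Test-specific fallbacks plus the time-zone-dependent ones in one mark.
    @allow_non_gpu('BroadcastExchangeExec', 'GreaterThanOrEqual', *non_utc_allow)
    @pytest.mark.parametrize('join_type', ['LeftOuter', 'FullOuter'])
    def test_fallback_shape(join_type):
        pass  # body elided; the decorator composition is the point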
diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py
index 41571a203d5e..23c90acc132b 100644
--- a/integration_tests/src/main/python/json_test.py
+++ b/integration_tests/src/main/python/json_test.py
@@ -184,7 +184,7 @@ def test_json_date_formats_round_trip(spark_tmp_path, date_format, v1_enabled_li
 @pytest.mark.parametrize('ts_part', json_supported_ts_parts)
 @pytest.mark.parametrize('date_format', json_supported_date_formats)
 @pytest.mark.parametrize('v1_enabled_list', ["", "json"])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_enabled_list):
     full_format = date_format + ts_part
     data_gen = TimestampGen()
@@ -203,21 +203,20 @@ def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_ena
             .json(data_path),
         conf=updated_conf)
 
-@allow_non_gpu('FileSourceScanExec', 'ProjectExec')
+@allow_non_gpu('FileSourceScanExec', 'ProjectExec', *non_utc_allow)
 @pytest.mark.skipif(is_before_spark_341(), reason='`TIMESTAMP_NTZ` is only supported in PySpark 341+')
 @pytest.mark.parametrize('ts_part', json_supported_ts_parts)
 @pytest.mark.parametrize('date_format', json_supported_date_formats)
 @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_json_ts_formats_round_trip_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type):
     json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'json', 'FileSourceScanExec')
 
-@allow_non_gpu('BatchScanExec', 'ProjectExec')
+@allow_non_gpu('BatchScanExec', 'ProjectExec', *non_utc_allow)
 @pytest.mark.skipif(is_before_spark_341(), reason='`TIMESTAMP_NTZ` is only supported in PySpark 341+')
 @pytest.mark.parametrize('ts_part', json_supported_ts_parts)
 @pytest.mark.parametrize('date_format', json_supported_date_formats)
 @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+
 def test_json_ts_formats_round_trip_ntz_v2(spark_tmp_path, date_format, ts_part, timestamp_type):
     json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, '', 'BatchScanExec')
@@ -397,7 +396,7 @@ def test_json_read_invalid_dates(std_input_path, filename, schema, read_func, an
     'CORRECTED',
     'EXCEPTION'
 ])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_json_read_valid_timestamps(std_input_path, filename, schema, read_func, ansi_enabled, time_parser_policy, \
         spark_tmp_table_factory):
     updated_conf = copy_and_update(_enable_all_types_conf,
@@ -455,7 +454,7 @@ def test_json_read_count(spark_tmp_path, v1_enabled_list):
         lambda spark : spark.read.schema(schema).json(data_path),
         conf=updated_conf)
 
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_from_json_map():
     # The test here is working around some inconsistencies in how the keys are parsed for maps
     # on the GPU the keys are dense, but on the CPU they are sparse
@@ -490,7 +489,7 @@ def test_from_json_map_fallback():
     'struct',
     'struct',
 ])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_from_json_struct(schema):
     # note that column 'a' does not use leading zeroes due to https://github.com/NVIDIA/spark-rapids/issues/9588
     json_string_gen = StringGen(r'{"a": [1-9]{0,5}, "b": "[A-Z]{0,5}", "c": 1\d\d\d}') \
@@ -510,7 +509,7 @@ def test_from_json_struct(schema):
     r'{ "bool": [0-9]{4}-[0-9]{2}-[0-9]{2} }',
     r'{ "bool": "[0-9]{4}-[0-9]{2}-[0-9]{2}" }'
 ])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_from_json_struct_boolean(pattern):
     json_string_gen = StringGen(pattern) \
         .with_special_case('', weight=50) \
@@ -520,7 +519,7 @@ def test_from_json_struct_boolean(pattern):
         .select(f.col('a'), f.from_json('a', 'struct')),
         conf={"spark.rapids.sql.expression.JsonToStructs": True})
 
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_from_json_struct_decimal():
     json_string_gen = StringGen(r'{ "a": "[+-]?([0-9]{0,5})?(\.[0-9]{0,2})?([eE][+-]?[0-9]{1,2})?" }') \
         .with_special_pattern('', weight=50) \
@@ -560,7 +559,7 @@ def test_from_json_struct_decimal():
     pytest.param("LEGACY", marks=pytest.mark.allow_non_gpu('ProjectExec')),
     "CORRECTED"
 ])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_from_json_struct_date(date_gen, date_format, time_parser_policy):
     json_string_gen = StringGen(r'{ "a": ' + date_gen + ' }') \
         .with_special_case('{ "a": null }') \
@@ -643,7 +642,7 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format
     "CORRECTED"
 ])
 @pytest.mark.parametrize('ansi_enabled', [ True, False ])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_from_json_struct_timestamp(timestamp_gen, timestamp_format, time_parser_policy, ansi_enabled):
     json_string_gen = StringGen(r'{ "a": ' + timestamp_gen + ' }') \
         .with_special_case('{ "a": null }') \
@@ -695,7 +694,7 @@ def test_from_json_struct_timestamp_fallback_non_default_format(timestamp_gen, t
 @pytest.mark.parametrize('schema', ['struct',
                                     'struct>',
                                     'struct>'])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_from_json_struct_of_struct(schema):
     json_string_gen = StringGen(r'{"teacher": "[A-Z]{1}[a-z]{2,5}",' \
                                 r'"student": {"name": "[A-Z]{1}[a-z]{2,5}", "age": 1\d}}') \
@@ -710,7 +709,7 @@ def test_from_json_struct_of_struct(schema):
 @pytest.mark.parametrize('schema', ['struct',
                                     'struct>>',
                                     'struct>>'])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_from_json_struct_of_list(schema):
     json_string_gen = StringGen(r'{"teacher": "[A-Z]{1}[a-z]{2,5}",' \
                                 r'"student": \[{"name": "[A-Z]{1}[a-z]{2,5}", "class": "junior"},' \
@@ -723,7 +722,7 @@ def test_from_json_struct_of_list(schema):
         conf={"spark.rapids.sql.expression.JsonToStructs": True})
 
 @pytest.mark.parametrize('schema', ['struct', 'struct'])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_from_json_struct_all_empty_string_input(schema):
     json_string_gen = StringGen('')
     assert_gpu_and_cpu_are_equal_collect(
@@ -800,7 +799,7 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name):
     pytest.param(True, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9517')),
     False
 ])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_structs_to_json(spark_tmp_path, data_gen, ignore_null_fields, pretty):
     struct_gen = StructGen([
         ('a', data_gen),
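The json_test changes also show why allow_non_gpu is the stronger check: an xfail-ed test stops validating results entirely off-UTC, while allow_non_gpu still runs the CPU-vs-GPU comparison and only tolerates the named plans leaving the GPU. A rough sketch of the behavioral difference under plain pytest, with a trivial assertion standing in for the real comparison:

    import pytest

    def is_not_utc():
        return False  # stand-in; the real helper checks the session time zone

    # Before: off-UTC the whole test was expected to fail, so a wrong result
    # off-UTC would pass unnoticed as an expected failure.
    @pytest.mark.xfail(condition=is_not_utc(), reason='expected to fail off-UTC')
    def test_with_xfail():
        assert 1 + 1 == 2

    # After: the assertion always runs; only plan placement is relaxed via the
    # custom allow_non_gpu mark that the RAPIDS pytest plugin consumes.
    @pytest.mark.allow_non_gpu('ProjectExec')
    def test_with_allow_non_gpu():
        assert 1 + 1 == 2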
diff --git a/integration_tests/src/main/python/map_test.py b/integration_tests/src/main/python/map_test.py
index dbd02884ddc7..d3f332dce7f1 100644
--- a/integration_tests/src/main/python/map_test.py
+++ b/integration_tests/src/main/python/map_test.py
@@ -57,7 +57,7 @@
 
 @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_map_keys(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, data_gen).selectExpr(
@@ -69,7 +69,7 @@ def test_map_keys(data_gen):
 
 @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_map_values(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, data_gen).selectExpr(
@@ -152,7 +152,7 @@ def test_get_map_value_numeric_keys(data_gen):
 
 @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_get_map_value_supported_keys(data_gen):
     key_gen = data_gen._key_gen
     # first expression is not guaranteed to hit
@@ -191,7 +191,7 @@ def query_map_scalar(spark):
 @allow_non_gpu('WindowLocalExec')
 @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9683')
 @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_map_scalars_supported_key_types(data_gen):
     key_gen = data_gen._key_gen
     def query_map_scalar(spark):
@@ -229,7 +229,7 @@ def query_map_scalar(spark):
 
 @pytest.mark.parametrize('data_gen', [MapGen(DateGen(nullable=False), value(), max_length=6)
     for value in get_map_value_gens()], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_get_map_value_date_keys(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, data_gen).selectExpr(
@@ -241,7 +241,7 @@ def test_get_map_value_date_keys(data_gen):
 
 @pytest.mark.parametrize('data_gen', [MapGen(TimestampGen(nullable=False), value(), max_length=6)
     for value in get_map_value_gens()], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_get_map_value_timestamp_keys(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, data_gen).selectExpr(
@@ -260,7 +260,7 @@ def test_map_side_effects():
 
 @pytest.mark.parametrize('key_gen', [StringGen(nullable=False), IntegerGen(nullable=False), basic_struct_gen], ids=idfn)
 @pytest.mark.parametrize('value_gen', [StringGen(nullable=True), IntegerGen(nullable=True), basic_struct_gen], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_single_entry_map(key_gen, value_gen):
     data_gen = [('a', key_gen), ('b', value_gen)]
     assert_gpu_and_cpu_are_equal_collect(
@@ -464,7 +464,7 @@ def test_simple_get_map_value_with_strict_index(strict_index, data_gen):
     [MapGen(StringGen(pattern='key_[0-9]', nullable=False), value(), max_length=6)
         for value in get_map_value_gens()],
     ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_element_at_map_string_keys(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, data_gen).selectExpr(
@@ -478,7 +478,7 @@ def test_element_at_map_string_keys(data_gen):
 
 @pytest.mark.parametrize('data_gen', numeric_key_map_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_element_at_map_numeric_keys(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, data_gen).selectExpr(
@@ -494,7 +494,7 @@ def test_element_at_map_numeric_keys(data_gen):
     [MapGen(DecimalGen(precision=35, scale=2, nullable=False), value(), max_length=6)
         for value in get_map_value_gens(precision=37, scale=0)],
     ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_get_map_value_element_at_map_dec_col_keys(data_gen):
     keys = DecimalGen(precision=35, scale=2)
     assert_gpu_and_cpu_are_equal_collect(
@@ -520,7 +520,7 @@ def test_get_map_value_element_at_map_string_col_keys_ansi(data_gen, ansi):
     [MapGen(StringGen(pattern='key_[0-9]', nullable=False), value(), max_length=6)
         for value in get_map_value_gens(precision=37, scale=0)],
     ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_get_map_value_element_at_map_string_col_keys(data_gen):
     keys = StringGen(pattern='key_[0-9]')
     assert_gpu_and_cpu_are_equal_collect(
@@ -577,7 +577,7 @@ def test_get_map_value_string_col_keys_ansi_null(data_gen):
 
 @pytest.mark.parametrize('data_gen', [MapGen(DateGen(nullable=False), value(), max_length=6)
     for value in get_map_value_gens()], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_element_at_map_date_keys(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, data_gen).selectExpr(
@@ -591,7 +591,7 @@ def test_element_at_map_date_keys(data_gen):
     [MapGen(TimestampGen(nullable=False), value(), max_length=6)
         for value in get_map_value_gens()],
     ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_element_at_map_timestamp_keys(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, data_gen).selectExpr(
@@ -623,7 +623,7 @@ def test_map_element_at_ansi_null(data_gen):
         conf=ansi_enabled_conf)
 
 @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_transform_values(data_gen):
     def do_it(spark):
         columns = ['a', 'b',
@@ -662,7 +662,7 @@ def do_it(spark):
 
 @pytest.mark.parametrize('data_gen', map_gens_sample + decimal_128_map_gens + decimal_64_map_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_transform_keys(data_gen):
     # The processing here is very limited, because we need to be sure we do not create duplicate keys.
     # This can happen because of integer overflow, round off errors in floating point, etc. So for now
@@ -722,7 +722,7 @@ def test_sql_map_scalars(query):
 
 @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_map_filter(data_gen):
     columns = ['map_filter(a, (key, value) -> isnotnull(value) )',
                'map_filter(a, (key, value) -> isnull(value) )',
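For the map tests above, the hunks read more easily with the MapGen convention in mind: keys come from a non-nullable generator (exposed as `_key_gen`, which several tests reach into), and values from a second generator. A tiny stand-in sketch of that structure, not the real data_gen classes:

    # Minimal stand-ins for the data_gen classes referenced above.
    class DateGen:
        def __init__(self, nullable=True):
            self.nullable = nullable

    class MapGen:
        def __init__(self, key_gen, value_gen, max_length=None):
            # Map keys must be non-nullable, mirroring Spark map semantics.
            assert not key_gen.nullable
            self._key_gen = key_gen      # tests read data_gen._key_gen
            self.value_gen = value_gen
            self.max_length = max_length

    gen = MapGen(DateGen(nullable=False), DateGen(), max_length=6)
    assert gen._key_gen.nullable is False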
diff --git a/integration_tests/src/main/python/orc_cast_test.py b/integration_tests/src/main/python/orc_cast_test.py
index cccd60125b98..5f6838df6887 100644
--- a/integration_tests/src/main/python/orc_cast_test.py
+++ b/integration_tests/src/main/python/orc_cast_test.py
@@ -17,6 +17,7 @@
 from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error
 from conftest import is_not_utc
 from data_gen import *
+from marks import allow_non_gpu
 from pyspark.sql.types import *
 from spark_session import with_cpu_session
 from orc_test import reader_opt_confs
@@ -50,7 +51,7 @@ def test_casting_among_integer_types(spark_tmp_path, reader_confs, v1_enabled_li
 
 @pytest.mark.parametrize('to_type', ['float', 'double', 'string', 'timestamp'])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_casting_from_integer(spark_tmp_path, to_type):
     orc_path = spark_tmp_path + '/orc_cast_integer'
     # The Python 'datetime' module only supports a max-year of 10000, so we set the Long type max
@@ -72,7 +73,7 @@
 @pytest.mark.parametrize('overflow_long_gen', [LongGen(min_val=int(1e16)),
                                                LongGen(max_val=int(-1e16))])
 @pytest.mark.parametrize('to_type', ['timestamp'])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_casting_from_overflow_long(spark_tmp_path, overflow_long_gen,to_type):
     # Timestamp(micro-seconds) is actually type of int64, when casting long(int64) to timestamp,
     # we need to multiply 1e6 (or 1e3), and it may cause overflow. This function aims to test
@@ -103,7 +104,7 @@ def test_casting_from_float_and_double(spark_tmp_path, to_type):
 
 @pytest.mark.parametrize('data_gen', [DoubleGen(max_exp=32, special_cases=None),
                                       DoubleGen(max_exp=32, special_cases=[8.88e9, 9.99e10, 1.314e11])])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_casting_from_double_to_timestamp(spark_tmp_path, data_gen):
     # ORC will assume the original double value in seconds, we need to convert them to
     # timestamp(INT64 in micro-seconds).
@@ -127,7 +128,7 @@ def test_casting_from_double_to_timestamp(spark_tmp_path, data_gen):
     )
 
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_casting_from_overflow_double_to_timestamp(spark_tmp_path):
     orc_path = spark_tmp_path + '/orc_casting_from_overflow_double_to_timestamp'
     with_cpu_session(
diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py
index 409d08509878..f539e5f6c748 100644
--- a/integration_tests/src/main/python/orc_test.py
+++ b/integration_tests/src/main/python/orc_test.py
@@ -68,7 +68,7 @@ def get_orc_timestamp_gen(nullable=True):
 @pytest.mark.parametrize('v1_enabled_list', ["", "orc"])
 @pytest.mark.parametrize('orc_impl', ["native", "hive"])
 @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, reader_confs):
     all_confs = copy_and_update(reader_confs, {
         'spark.sql.sources.useV1SourceList': v1_enabled_list,
@@ -160,7 +160,7 @@ def test_orc_fallback(spark_tmp_path, read_func, disable_conf):
 @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql])
 @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn)
 @pytest.mark.parametrize('v1_enabled_list', ["", "orc"])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_enabled_list):
     gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
     data_path = spark_tmp_path + '/ORC_DATA'
@@ -186,7 +186,7 @@ def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_e
 @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql])
 @pytest.mark.parametrize('v1_enabled_list', ["", "orc"])
 @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_pred_push_round_trip(spark_tmp_path, orc_gen, read_func, v1_enabled_list, reader_confs):
     data_path = spark_tmp_path + '/ORC_DATA'
     # Append two struct columns to verify nested predicate pushdown.
@@ -243,7 +243,7 @@ def test_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, rea
 
 @pytest.mark.parametrize('v1_enabled_list', ["", "orc"])
 @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs):
     # Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed
     # we should go with a more standard set of generators
@@ -310,7 +310,7 @@ def test_partitioned_read_just_partitions(spark_tmp_path, v1_enabled_list, reade
 
 @pytest.mark.parametrize('v1_enabled_list', ["", "orc"])
 @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_merge_schema_read(spark_tmp_path, v1_enabled_list, reader_confs):
     # Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed
     # we should go with a more standard set of generators
@@ -589,7 +589,7 @@ def test_read_struct_without_stream(spark_tmp_path):
 @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn)
 @pytest.mark.parametrize('v1_enabled_list', ["", "orc"])
 @pytest.mark.parametrize('case_sensitive', ["false", "true"])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_read_with_more_columns(spark_tmp_path, orc_gen, reader_confs, v1_enabled_list, case_sensitive):
     struct_gen = StructGen([('nested_col', orc_gen)])
     # Map is not supported yet.
@@ -777,7 +777,7 @@ def test_orc_read_varchar_as_string(std_input_path):
 
 @pytest.mark.parametrize('gens', orc_gens_list, ids=idfn)
 @pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_read_round_trip_for_multithreaded_combining(spark_tmp_path, gens, keep_order):
     gen_list = [('_c' + str(i), gen) for i, gen in enumerate(gens)]
     data_path = spark_tmp_path + '/ORC_DATA'
@@ -792,7 +792,7 @@ def test_read_round_trip_for_multithreaded_combining(spark_tmp_path, gens, keep_
 
 @pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))])
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
 def test_simple_partitioned_read_for_multithreaded_combining(spark_tmp_path, keep_order):
     orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
                 string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)),
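A quick arithmetic check of why the ORC cast tests above pick bounds like LongGen(min_val=int(1e16)): per the comments in those hunks, timestamps are int64 microseconds, so casting a long holding seconds multiplies by 1e6 and can overflow. Worked out:

    # int64 timestamps hold microseconds, so seconds * 1e6 must fit in int64.
    INT64_MAX = (1 << 63) - 1        # 9_223_372_036_854_775_807, about 9.2e18
    seconds = int(1e16)              # the LongGen(min_val=int(1e16)) bound
    micros = seconds * 1_000_000     # 1e22, far beyond the int64 range
    assert micros > INT64_MAX        # hence the dedicated overflow tests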
@pytest.mark.parametrize('orc_impl', ["native", "hive"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_write_sql_save_table(spark_tmp_path, orc_gens, ts_type, orc_impl, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -203,9 +203,8 @@ def test_write_sql_save_table(spark_tmp_path, orc_gens, ts_type, orc_impl, spark data_path, conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True}) -@allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec') +@allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec', *non_utc_allow) @pytest.mark.parametrize('codec', ['zlib', 'lzo']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_orc_write_compression_fallback(spark_tmp_path, codec, spark_tmp_table_factory): gen = TimestampGen() data_path = spark_tmp_path + '/PARQUET_DATA' @@ -262,7 +261,7 @@ def sql_write(spark, path): @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_write_empty_orc_round_trip(spark_tmp_path, orc_gens): def create_empty_df(spark, path): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index f6cc2a0141b1..26f90b19ceb8 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -164,7 +164,7 @@ def setup_table(spark): @pytest.mark.parametrize('read_func', [read_parquet_df, read_parquet_sql]) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_read_round_trip(spark_tmp_path, parquet_gens, read_func, reader_confs, v1_enabled_list): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -299,7 +299,7 @@ def test_parquet_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_l @pytest.mark.parametrize('read_func', [read_parquet_df, read_parquet_sql]) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_pred_push_round_trip(spark_tmp_path, parquet_gen, read_func, v1_enabled_list, reader_confs): data_path = spark_tmp_path + '/PARQUET_DATA' gen_list = [('a', RepeatSeqGen(parquet_gen, 100)), ('b', parquet_gen)] @@ -319,7 +319,7 @@ def test_parquet_pred_push_round_trip(spark_tmp_path, parquet_gen, read_func, v1 @pytest.mark.parametrize('ts_rebase_read', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')]) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', 
["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_read_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, parquet_gens, ts_type, ts_rebase_write, ts_rebase_read, reader_confs, v1_enabled_list): @@ -359,7 +359,7 @@ def test_parquet_decimal_read_legacy(spark_tmp_path, parquet_gens, read_func, re @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) @pytest.mark.parametrize('batch_size', [100, INT_MAX]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs, batch_size): # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed # we should go with a more standard set of generators @@ -391,7 +391,7 @@ def test_parquet_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader # In this we are reading the data, but only reading the key the data was partitioned by @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_partitioned_read_just_partitions(spark_tmp_path, v1_enabled_list, reader_confs): parquet_gens = [byte_gen] gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] @@ -534,7 +534,7 @@ def read_and_remove(spark): @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_read_merge_schema(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed # we should go with a more standard set of generators @@ -559,7 +559,7 @@ def test_parquet_read_merge_schema(spark_tmp_path, v1_enabled_list, reader_confs @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_read_merge_schema_from_conf(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed # we should go with a more standard set of generators @@ -875,7 +875,7 @@ def test_parquet_reading_from_unaligned_pages_basic_filters(spark_tmp_path, read @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('enable_dictionary', ["true", "false"], ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def 
test_parquet_reading_from_unaligned_pages_all_types(spark_tmp_path, reader_confs, enable_dictionary, v1_enabled_list): all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) data_path = spark_tmp_path + '/PARQUET_UNALIGNED_DATA' @@ -903,7 +903,7 @@ def test_parquet_reading_from_unaligned_pages_all_types(spark_tmp_path, reader_c @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('enable_dictionary', ["true", "false"], ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_reading_from_unaligned_pages_all_types_dict_optimized(spark_tmp_path, reader_confs, enable_dictionary, v1_enabled_list): all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) data_path = spark_tmp_path + '/PARQUET_UNALIGNED_DATA' diff --git a/integration_tests/src/main/python/parquet_testing_test.py b/integration_tests/src/main/python/parquet_testing_test.py index a4600de7b860..abf06d11e5fb 100644 --- a/integration_tests/src/main/python/parquet_testing_test.py +++ b/integration_tests/src/main/python/parquet_testing_test.py @@ -17,7 +17,8 @@ from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error from conftest import get_std_input_path, is_parquet_testing_tests_forced, is_precommit_run, is_not_utc -from data_gen import copy_and_update +from data_gen import * +from marks import allow_non_gpu from pathlib import Path import pytest from spark_session import is_before_spark_330, is_spark_350_or_later @@ -122,7 +123,7 @@ def gen_testing_params_for_valid_files(): @pytest.mark.parametrize("path", gen_testing_params_for_valid_files()) @pytest.mark.parametrize("confs", [_native_reader_confs, _java_reader_confs]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_testing_valid_files(path, confs): assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.read.parquet(path), conf=confs) diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index 9584f2a35202..00079fffe0dc 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -90,7 +90,7 @@ @pytest.mark.order(1) # at the head of xdist worker queue if pytest-order is installed @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_write_round_trip(spark_tmp_path, parquet_gens): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -136,7 +136,7 @@ def test_write_round_trip_corner(spark_tmp_path, par_gen): ArrayGen(TimestampGen(), max_length=10), MapGen(TimestampGen(nullable=False), TimestampGen())]], ids=idfn) @pytest.mark.parametrize('ts_type', parquet_ts_write_options) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def 
test_timestamp_write_round_trip(spark_tmp_path, parquet_gens, ts_type):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
@@ -150,7 +150,7 @@ def test_timestamp_write_round_trip(spark_tmp_path, parquet_gens, ts_type):
@pytest.mark.parametrize('ts_type', parquet_ts_write_options)
@pytest.mark.parametrize('ts_rebase', ['CORRECTED'])
@ignore_order
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase):
    gen = TimestampGen()
    data_path = spark_tmp_path + '/PARQUET_DATA'
@@ -174,7 +174,7 @@ def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase):
@ignore_order
@pytest.mark.order(1) # at the head of xdist worker queue if pytest-order is installed
@pytest.mark.parametrize('parquet_gen', parquet_part_write_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_part_write_round_trip(spark_tmp_path, parquet_gen):
    gen_list = [('a', RepeatSeqGen(parquet_gen, 10)),
                ('b', parquet_gen)]
@@ -188,8 +188,7 @@ def test_part_write_round_trip(spark_tmp_path, parquet_gen):
@pytest.mark.skipif(is_spark_340_or_later() or is_databricks122_or_later(), reason="`WriteFilesExec` is only supported in Spark 340+")
@pytest.mark.parametrize('data_gen', [TimestampGen()], ids=idfn)
-@pytest.mark.allow_non_gpu("DataWritingCommandExec")
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@pytest.mark.allow_non_gpu("DataWritingCommandExec", *non_utc_allow)
def test_int96_write_conf(spark_tmp_path, data_gen):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    confs = copy_and_update(writer_confs, {
@@ -206,8 +205,7 @@ def test_int96_write_conf(spark_tmp_path, data_gen):
@pytest.mark.skipif(is_before_spark_340() and not is_databricks122_or_later(), reason="`WriteFilesExec` is only supported in Spark 340+")
@pytest.mark.parametrize('data_gen', [TimestampGen()], ids=idfn)
# Note: WriteFilesExec is introduced in Spark 340. 
-@pytest.mark.allow_non_gpu("DataWritingCommandExec", "WriteFilesExec") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.allow_non_gpu("DataWritingCommandExec", "WriteFilesExec", *non_utc_allow) def test_int96_write_conf_with_write_exec(spark_tmp_path, data_gen): data_path = spark_tmp_path + '/PARQUET_DATA' confs = copy_and_update(writer_confs, { @@ -221,7 +219,7 @@ def test_int96_write_conf_with_write_exec(spark_tmp_path, data_gen): ['DataWritingCommandExec', 'WriteFilesExec'], confs) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_all_null_int96(spark_tmp_path): class AllNullTimestampGen(TimestampGen): def start(self, rand): @@ -251,7 +249,7 @@ def test_compress_write_round_trip(spark_tmp_path, compress): @pytest.mark.order(2) @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_write_save_table(spark_tmp_path, parquet_gens, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -269,7 +267,7 @@ def write_parquet_sql_from(spark, df, data_path, write_to_table): @pytest.mark.order(2) @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_write_sql_save_table(spark_tmp_path, parquet_gens, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -292,7 +290,7 @@ def writeParquetUpgradeCatchException(spark, df, data_path, spark_tmp_table_fact ('TIMESTAMP_MICROS', TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc))), ('TIMESTAMP_MILLIS', TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc)))]) @pytest.mark.parametrize('rebase', ["CORRECTED","EXCEPTION"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_ts_write_fails_datetime_exception(spark_tmp_path, ts_write_data_gen, spark_tmp_table_factory, rebase): ts_write, gen = ts_write_data_gen data_path = spark_tmp_path + '/PARQUET_DATA' @@ -471,7 +469,7 @@ def generate_map_with_empty_validity(spark, path): @pytest.mark.parametrize('data_gen', parquet_nested_datetime_gen, ids=idfn) @pytest.mark.parametrize('ts_write', parquet_ts_write_options) @pytest.mark.parametrize('ts_rebase_write', ['EXCEPTION']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_write_fails_legacy_datetime(spark_tmp_path, data_gen, ts_write, ts_rebase_write): data_path = spark_tmp_path + '/PARQUET_DATA' all_confs = {'spark.sql.parquet.outputTimestampType': ts_write, @@ -489,7 +487,7 @@ def writeParquetCatchException(spark, data_gen, 
data_path): @pytest.mark.parametrize('ts_write', parquet_ts_write_options) @pytest.mark.parametrize('ts_rebase_write', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')]) @pytest.mark.parametrize('ts_rebase_read', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_write_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, data_gen, ts_write, ts_rebase_write, ts_rebase_read): data_path = spark_tmp_path + '/PARQUET_DATA' @@ -533,7 +531,7 @@ def test_it(spark): with_gpu_session(test_it, conf) @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_write_empty_parquet_round_trip(spark_tmp_path, parquet_gens): def create_empty_df(spark, path): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] @@ -773,7 +771,7 @@ def read_table(spark, path): # Test to avoid regression on a known bug in Spark. For details please visit https://github.com/NVIDIA/spark-rapids/issues/8693 @pytest.mark.parametrize('ts_rebase', ['LEGACY', 'CORRECTED']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hive_timestamp_value(spark_tmp_table_factory, spark_tmp_path, ts_rebase): def func_test(create_table, read_table, data_path, conf): assert_gpu_and_cpu_writes_are_equal_collect(create_table, read_table, data_path, conf=conf) diff --git a/integration_tests/src/main/python/qa_nightly_select_test.py b/integration_tests/src/main/python/qa_nightly_select_test.py index d1a894e8e00e..f783ca025180 100644 --- a/integration_tests/src/main/python/qa_nightly_select_test.py +++ b/integration_tests/src/main/python/qa_nightly_select_test.py @@ -24,8 +24,8 @@ from qa_nightly_sql import * import pytest from spark_session import with_cpu_session, is_jvm_charset_utf8 -from marks import approximate_float, ignore_order, incompat, qarun -from data_gen import copy_and_update +from marks import approximate_float, ignore_order, incompat, qarun, allow_non_gpu +from data_gen import * def num_stringDf(spark): print("### CREATE DATAFRAME 1 ####") @@ -159,7 +159,7 @@ def idfn(val): @incompat @qarun @pytest.mark.parametrize('sql_query_line', SELECT_SQL, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_select(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -172,7 +172,7 @@ def test_select(sql_query_line, pytestconfig): @incompat @qarun @pytest.mark.parametrize('sql_query_line', SELECT_NEEDS_SORT_SQL, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_needs_sort_select(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: diff --git a/integration_tests/src/main/python/repart_test.py b/integration_tests/src/main/python/repart_test.py index 28782e77cebe..54a540ce9a13 100644 --- a/integration_tests/src/main/python/repart_test.py +++ 
b/integration_tests/src/main/python/repart_test.py @@ -166,14 +166,14 @@ def test_union_by_name(data_gen): pytest.param([('array' + str(i), gen) for i, gen in enumerate(array_gens_sample + [ArrayGen(BinaryGen(max_length=5), max_length=5)])]), pytest.param([('map' + str(i), gen) for i, gen in enumerate(map_gens_sample)]), ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_coalesce_types(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen).coalesce(2)) @pytest.mark.parametrize('num_parts', [1, 10, 100, 1000, 2000], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_coalesce_df(num_parts, length): #This should change eventually to be more than just the basic gens gen_list = [('_c' + str(i), gen) for i, gen in enumerate(all_basic_gens + decimal_gens + [binary_gen])] @@ -189,7 +189,7 @@ def test_coalesce_df(num_parts, length): @pytest.mark.parametrize('num_parts', [1, 10, 2345], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_repartition_df(data_gen, num_parts, length): from pyspark.sql.functions import lit assert_gpu_and_cpu_are_equal_collect( @@ -206,7 +206,7 @@ def test_repartition_df(data_gen, num_parts, length): @pytest.mark.parametrize('num_parts', [1, 10, 2345], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. 
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_repartition_df_for_round_robin(data_gen, num_parts, length):
    from pyspark.sql.functions import lit
    assert_gpu_and_cpu_are_equal_collect(
@@ -280,7 +280,7 @@ def test_hash_fallback(data_gen):
            ([('a', decimal_gen_64bit), ('b', decimal_gen_64bit), ('c', decimal_gen_64bit)], ['a', 'b', 'c']),
            ([('a', decimal_gen_128bit), ('b', decimal_gen_128bit), ('c', decimal_gen_128bit)], ['a', 'b', 'c']),
            ], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_hash_repartition_exact(gen, num_parts):
    data_gen = gen[0]
    part_on = gen[1]
diff --git a/integration_tests/src/main/python/sort_test.py b/integration_tests/src/main/python/sort_test.py
index 7a8d1c586127..3e447a5e7721 100644
--- a/integration_tests/src/main/python/sort_test.py
+++ b/integration_tests/src/main/python/sort_test.py
@@ -194,13 +194,13 @@ def test_single_nested_sort_in_part(data_gen, order, stable_sort):
    boolean_gen, timestamp_gen, date_gen, string_gen, null_gen, StructGen([('child0', long_gen)])
    ] + orderable_decimal_gens + single_level_array_gens
@pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_multi_orderby(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc()))
@pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_multi_orderby_on_array(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc()))
@@ -208,7 +208,7 @@ def test_multi_orderby_on_array(data_gen):
# Spark CPU itself has an issue with negative scale for take ordered and project
orderable_gens_sort_without_neg_decimal = [n for n in orderable_gens_sort if not (isinstance(n, DecimalGen) and n.scale < 0)]
@pytest.mark.parametrize('data_gen', orderable_gens_sort_without_neg_decimal + single_level_array_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_multi_orderby_with_limit(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc()).limit(100))
@@ -216,7 +216,7 @@ def test_multi_orderby_with_limit(data_gen):
# We added a partitioning optimization to take_ordered_and_project
# This should trigger it. 
@pytest.mark.parametrize('data_gen', orderable_gens_sort_without_neg_decimal + single_level_array_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_multi_orderby_with_limit_single_part(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).coalesce(1).orderBy(f.col('a'), f.col('b').desc()).limit(100)) @@ -261,7 +261,7 @@ def test_single_orderby_with_skew(data_gen): # We are not trying all possibilities, just doing a few with numbers so the query works. @pytest.mark.parametrize('data_gen', [all_basic_struct_gen, StructGen([['child0', all_basic_struct_gen]])], ids=idfn) @pytest.mark.parametrize('stable_sort', ['STABLE', 'OUTOFCORE'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_single_nested_orderby_with_skew(data_gen, stable_sort): sort_conf = {'spark.rapids.sql.stableSort.enabled': stable_sort == 'STABLE'} # When doing range partitioning the upstream data is sampled to try and get the bounds for cutoffs. @@ -305,7 +305,7 @@ def test_large_orderby(data_gen, stable_sort): simple_string_to_string_map_gen, ArrayGen(byte_gen, max_length=5)], ids=idfn) @pytest.mark.order(2) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_large_orderby_nested_ridealong(data_gen): # We use a UniqueLongGen to avoid duplicate keys that can cause ambiguity in the sort # results, especially on distributed clusters. @@ -326,7 +326,7 @@ def test_large_orderby_nested_ridealong(data_gen): ArrayGen(byte_gen, max_length=5), ArrayGen(decimal_gen_128bit, max_length=5)], ids=idfn) @pytest.mark.order(2) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_orderby_nested_ridealong_limit(data_gen): # We use a UniqueLongGen to avoid duplicate keys that can cause ambiguity in the sort # results, especially on distributed clusters. diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py index 81ce8d749484..681f7d3d6f1b 100644 --- a/integration_tests/src/main/python/window_function_test.py +++ b/integration_tests/src/main/python/window_function_test.py @@ -523,7 +523,7 @@ def test_rows_based_running_window_unpartitioned(b_gen, batch_size): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # Testing multiple batch sizes. 
@pytest.mark.parametrize('a_gen', integral_gens + [string_gen, date_gen, timestamp_gen], ids=meta_idfn('data:'))
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_running_window_without_partitions_runs_batched(a_gen, batch_size):
    """
    This tests the running window optimization as applied to RANGE-based window specifications,
@@ -647,7 +647,7 @@ def test_running_window_float_sum_without_partitions_runs_batched(batch_size):
@pytest.mark.parametrize('data_gen', all_basic_gens + [decimal_gen_32bit, orderable_decimal_gen_128bit], ids=meta_idfn('data:'))
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_window_running_rank_no_part(data_gen):
    # Keep the batch size small. We have tested these with operators with exact inputs already; this is mostly
    # testing the fixup operation.
@@ -675,7 +675,7 @@ def test_window_running_rank_no_part(data_gen):
# but small batch sizes can make sort very slow, so do the final order by locally
@ignore_order(local=True)
@pytest.mark.parametrize('data_gen', all_basic_gens + [decimal_gen_32bit], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_window_running_rank(data_gen):
    # Keep the batch size small. We have tested these with operators with exact inputs already; this is mostly
    # testing the fixup operation.
@@ -703,7 +703,7 @@ def test_window_running_rank(data_gen):
@pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches
@pytest.mark.parametrize('b_gen, c_gen', [(long_gen, x) for x in running_part_and_order_gens] + [(x, long_gen) for x in all_basic_gens + [decimal_gen_32bit]], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_rows_based_running_window_partitioned(b_gen, c_gen, batch_size):
    conf = {'spark.rapids.sql.batchSizeBytes': batch_size,
            'spark.rapids.sql.variableFloatAgg.enabled': True,
@@ -743,7 +743,7 @@ def test_rows_based_running_window_partitioned(b_gen, c_gen, batch_size):
@pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # Test different batch sizes.
@pytest.mark.parametrize('part_gen', [int_gen, long_gen], ids=idfn) # Partitioning is not really the focus of the test. 
@pytest.mark.parametrize('order_gen', [x for x in all_basic_gens_no_null if x not in boolean_gens] + [decimal_gen_32bit], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_range_running_window_runs_batched(part_gen, order_gen, batch_size): """ This tests the running window optimization as applied to RANGE-based window specifications, @@ -887,7 +887,7 @@ def window(oby_column): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('c_gen', lead_lag_data_gens, ids=idfn) @pytest.mark.parametrize('a_b_gen', part_and_order_gens, ids=meta_idfn('partAndOrderBy:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_multi_types_window_aggs_for_rows_lead_lag(a_b_gen, c_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size} data_gen = [ @@ -945,7 +945,7 @@ def do_it(spark): @approximate_float @pytest.mark.parametrize('struct_gen', lead_lag_struct_with_arrays_gen, ids=idfn) @pytest.mark.parametrize('a_b_gen', part_and_order_gens, ids=meta_idfn('partAndOrderBy:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_lead_lag_for_structs_with_arrays(a_b_gen, struct_gen): data_gen = [ ('a', RepeatSeqGen(a_b_gen, length=20)), @@ -979,7 +979,7 @@ def do_it(spark): @pytest.mark.parametrize('c_gen', [UniqueLongGen()], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('b_gen', [long_gen], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('a_gen', [long_gen], ids=meta_idfn('partBy:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_window_aggs_for_rows_lead_lag_on_arrays(a_gen, b_gen, c_gen, d_gen): data_gen = [ ('a', RepeatSeqGen(a_gen, length=20)), @@ -1009,7 +1009,7 @@ def test_window_aggs_for_rows_lead_lag_on_arrays(a_gen, b_gen, c_gen, d_gen): @approximate_float @pytest.mark.parametrize('c_gen', [string_gen], ids=idfn) @pytest.mark.parametrize('a_b_gen', part_and_order_gens, ids=meta_idfn('partAndOrderBy:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_multi_types_window_aggs_for_rows(a_b_gen, c_gen): data_gen = [ ('a', RepeatSeqGen(a_b_gen, length=20)), @@ -1061,13 +1061,12 @@ def do_it(spark): assert_gpu_and_cpu_are_equal_collect(do_it, conf = {'spark.rapids.sql.batchSizeBytes': '100'}) @pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0 is IGNORE NULLS supported for lead and lag by Spark") -@allow_non_gpu('WindowExec', 'Alias', 'WindowExpression', 'Lead', 'Literal', 'WindowSpecDefinition', 'SpecifiedWindowFrame') +@allow_non_gpu('WindowExec', 'Alias', 'WindowExpression', 'Lead', 'Literal', 'WindowSpecDefinition', 'SpecifiedWindowFrame', *non_utc_allow) @ignore_order(local=True) @pytest.mark.parametrize('d_gen', all_basic_gens, ids=meta_idfn('agg:')) @pytest.mark.parametrize('c_gen', [UniqueLongGen()], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('b_gen', 
[long_gen], ids=meta_idfn('orderBy:'))
@pytest.mark.parametrize('a_gen', [long_gen], ids=meta_idfn('partBy:'))
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_window_aggs_lead_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen):
    data_gen = [
            ('a', RepeatSeqGen(a_gen, length=20)),
@@ -1086,13 +1085,12 @@ def test_window_aggs_lead_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen):
    ''')
@pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0 is IGNORE NULLS supported for lead and lag by Spark")
-@allow_non_gpu('WindowExec', 'Alias', 'WindowExpression', 'Lag', 'Literal', 'WindowSpecDefinition', 'SpecifiedWindowFrame')
+@allow_non_gpu('WindowExec', 'Alias', 'WindowExpression', 'Lag', 'Literal', 'WindowSpecDefinition', 'SpecifiedWindowFrame', *non_utc_allow)
@ignore_order(local=True)
@pytest.mark.parametrize('d_gen', all_basic_gens, ids=meta_idfn('agg:'))
@pytest.mark.parametrize('c_gen', [UniqueLongGen()], ids=meta_idfn('orderBy:'))
@pytest.mark.parametrize('b_gen', [long_gen], ids=meta_idfn('orderBy:'))
@pytest.mark.parametrize('a_gen', [long_gen], ids=meta_idfn('partBy:'))
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_window_aggs_lag_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen):
    data_gen = [
            ('a', RepeatSeqGen(a_gen, length=20)),
@@ -1117,7 +1115,7 @@ def test_window_aggs_lag_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen):
@pytest.mark.parametrize('data_gen', [_grpkey_longs_with_timestamps, pytest.param(_grpkey_longs_with_nullable_timestamps)], ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_window_aggs_for_ranges_timestamps(data_gen):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=2048),
@@ -1265,7 +1263,7 @@ def test_window_aggregations_for_big_decimal_ranges(data_gen):
# SortExec does not support array type, so sort the result locally.
@ignore_order(local=True)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_window_aggs_for_rows_collect_list():
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark : gen_df(spark, _gen_data_for_collect_list),
@@ -1312,7 +1310,7 @@ def test_window_aggs_for_rows_collect_list():
@ignore_order(local=True)
# This test is aimed more at Databricks and their running window optimization than at ours;
# this is why we do not validate that we inserted a GpuRunningWindowExec yet.
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_running_window_function_exec_for_all_aggs():
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark : gen_df(spark, _gen_data_for_collect_list),
@@ -1421,7 +1419,7 @@ def do_it(spark):
# SortExec does not support array type, so sort the result locally. 
@ignore_order(local=True)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_window_aggs_for_rows_collect_set():
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, _gen_data_for_collect_set),
@@ -1482,8 +1480,7 @@ def test_window_aggs_for_rows_collect_set():
# See https://github.com/NVIDIA/spark-rapids/issues/3715
# and https://github.com/rapidsai/cudf/issues/11222
@ignore_order(local=True)
-@allow_non_gpu("ProjectExec", "SortArray")
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu("ProjectExec", "SortArray", *non_utc_allow)
def test_window_aggs_for_rows_collect_set_nested_array():
    conf = copy_and_update(_float_conf, {
        "spark.rapids.sql.castFloatToString.enabled": "true",
@@ -1596,7 +1593,7 @@ def do_it(spark):
# but small batch sizes can make sort very slow, so do the final order by locally
@ignore_order(local=True)
@pytest.mark.parametrize('ride_along', all_basic_gens + decimal_gens + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_window_ride_along(ride_along):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark : gen_df(spark, [('a', UniqueLongGen()), ('b', ride_along)]),
@@ -1672,7 +1669,7 @@ def test_unbounded_to_unbounded_window():
        'last(a) IGNORE NULLS OVER (PARTITION BY b ORDER BY c) '
@pytest.mark.parametrize('data_gen', all_basic_gens_no_null + decimal_gens + _nested_gens, ids=idfn)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_window_first_last_nth(data_gen):
    assert_gpu_and_cpu_are_equal_sql(
        # Coalesce is to make sure that first and last, which are non-deterministic, become deterministic
@@ -1693,7 +1690,7 @@ def test_window_first_last_nth_ignore_nulls(data_gen):
@ignore_order(local=True)
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@allow_non_gpu(*non_utc_allow)
def test_to_date_with_window_functions():
    """
    This test ensures that date expressions participating alongside window aggregations