From 30a07c632e431902d7ffad6fc6f630f092fd45b8 Mon Sep 17 00:00:00 2001
From: Prince Datta
Date: Wed, 28 Feb 2024 10:16:54 +0530
Subject: [PATCH] Add presubmit tests for 2.2 and remove for 1.5

---
 cloudbuild/cloudbuild.yaml        | 66 +++++++++++++++----------
 connectors/connectors.sh          |  2 +-
 dask/dask.sh                      |  4 +-
 hue/test_hue.py                   |  2 +
 kafka/test_kafka.py               |  4 +-
 livy/test_livy.py                 | 14 +++++--
 spark-rapids/test_spark_rapids.py |  9 +++++
 7 files changed, 60 insertions(+), 41 deletions(-)

diff --git a/cloudbuild/cloudbuild.yaml b/cloudbuild/cloudbuild.yaml
index c3ac41836..26e7caf27 100644
--- a/cloudbuild/cloudbuild.yaml
+++ b/cloudbuild/cloudbuild.yaml
@@ -9,39 +9,6 @@ steps:
     id: 'gcr-push'
     args: ['push', 'gcr.io/$PROJECT_ID/init-actions-image:$BUILD_ID']
 
-  # Run presubmit tests in parallel for 1.5 Debian image
-  - name: 'gcr.io/cloud-builders/kubectl'
-    id: 'dataproc-1.5-debian10-tests'
-    waitFor: ['gcr-push']
-    entrypoint: 'bash'
-    args: ['cloudbuild/run-presubmit-on-k8s.sh', 'gcr.io/$PROJECT_ID/init-actions-image:$BUILD_ID', '$BUILD_ID', '1.5-debian10']
-    env:
-    - 'COMMIT_SHA=$COMMIT_SHA'
-    - 'CLOUDSDK_COMPUTE_REGION=us-central1'
-    - 'CLOUDSDK_CONTAINER_CLUSTER=init-actions-presubmit'
-
-  # Run presubmit tests in parallel for 1.5 Rocky Linux image
-  - name: 'gcr.io/cloud-builders/kubectl'
-    id: 'dataproc-1.5-rocky8-tests'
-    waitFor: ['gcr-push']
-    entrypoint: 'bash'
-    args: ['cloudbuild/run-presubmit-on-k8s.sh', 'gcr.io/$PROJECT_ID/init-actions-image:$BUILD_ID', '$BUILD_ID', '1.5-rocky8']
-    env:
-    - 'COMMIT_SHA=$COMMIT_SHA'
-    - 'CLOUDSDK_COMPUTE_REGION=us-central1'
-    - 'CLOUDSDK_CONTAINER_CLUSTER=init-actions-presubmit'
-
-  # Run presubmit tests in parallel for 1.5 Ubuntu image
-  - name: 'gcr.io/cloud-builders/kubectl'
-    id: 'dataproc-1.5-ubuntu18-tests'
-    waitFor: ['gcr-push']
-    entrypoint: 'bash'
-    args: ['cloudbuild/run-presubmit-on-k8s.sh', 'gcr.io/$PROJECT_ID/init-actions-image:$BUILD_ID', '$BUILD_ID', '1.5-ubuntu18']
-    env:
-    - 'COMMIT_SHA=$COMMIT_SHA'
-    - 'CLOUDSDK_COMPUTE_REGION=us-central1'
-    - 'CLOUDSDK_CONTAINER_CLUSTER=init-actions-presubmit'
-
   # Run presubmit tests in parallel for 2.0 Debian image
   - name: 'gcr.io/cloud-builders/kubectl'
     id: 'dataproc-2.0-debian10-tests'
@@ -108,6 +75,39 @@ steps:
     - 'CLOUDSDK_COMPUTE_REGION=us-central1'
     - 'CLOUDSDK_CONTAINER_CLUSTER=init-actions-presubmit'
 
+  # Run presubmit tests in parallel for 2.2 Debian image
+  - name: 'gcr.io/cloud-builders/kubectl'
+    id: 'dataproc-2.2-debian12-tests'
+    waitFor: ['gcr-push']
+    entrypoint: 'bash'
+    args: ['cloudbuild/run-presubmit-on-k8s.sh', 'gcr.io/$PROJECT_ID/init-actions-image:$BUILD_ID', '$BUILD_ID', '2.2-debian12']
+    env:
+    - 'COMMIT_SHA=$COMMIT_SHA'
+    - 'CLOUDSDK_COMPUTE_REGION=us-central1'
+    - 'CLOUDSDK_CONTAINER_CLUSTER=init-actions-presubmit'
+
+  # Run presubmit tests in parallel for 2.2 Rocky Linux image
+  - name: 'gcr.io/cloud-builders/kubectl'
+    id: 'dataproc-2.2-rocky9-tests'
+    waitFor: ['gcr-push']
+    entrypoint: 'bash'
+    args: ['cloudbuild/run-presubmit-on-k8s.sh', 'gcr.io/$PROJECT_ID/init-actions-image:$BUILD_ID', '$BUILD_ID', '2.2-rocky9']
+    env:
+    - 'COMMIT_SHA=$COMMIT_SHA'
+    - 'CLOUDSDK_COMPUTE_REGION=us-central1'
+    - 'CLOUDSDK_CONTAINER_CLUSTER=init-actions-presubmit'
+
+  # Run presubmit tests in parallel for 2.2 Ubuntu image
+  - name: 'gcr.io/cloud-builders/kubectl'
+    id: 'dataproc-2.2-ubuntu22-tests'
+    waitFor: ['gcr-push']
+    entrypoint: 'bash'
+    args: ['cloudbuild/run-presubmit-on-k8s.sh', 'gcr.io/$PROJECT_ID/init-actions-image:$BUILD_ID', '$BUILD_ID', '2.2-ubuntu22']
+    env:
+    - 'COMMIT_SHA=$COMMIT_SHA'
+    - 'CLOUDSDK_COMPUTE_REGION=us-central1'
+    - 'CLOUDSDK_CONTAINER_CLUSTER=init-actions-presubmit'
+
   # Delete Docker image from GCR
   - name: 'gcr.io/cloud-builders/gcloud'
     args: ['container', 'images', 'delete', 'gcr.io/$PROJECT_ID/init-actions-image:$BUILD_ID']
diff --git a/connectors/connectors.sh b/connectors/connectors.sh
index a5a1697c2..22157dafa 100755
--- a/connectors/connectors.sh
+++ b/connectors/connectors.sh
@@ -66,7 +66,7 @@ function get_connector_url() {
       exit 1
     fi
     ;;
-  "1.5" | "2.0" | "2.1")
+  "1.5" | "2.0" | "2.1" | "2.2")
     scala_version="2.12"
     ;;
   *)
diff --git a/dask/dask.sh b/dask/dask.sh
index b9ec9e533..0a23e6a9f 100644
--- a/dask/dask.sh
+++ b/dask/dask.sh
@@ -43,7 +43,7 @@ readonly KNOX_HOME=/usr/lib/knox
 readonly KNOX_DASK_DIR=${KNOX_HOME}/data/services/dask/0.1.0
 readonly KNOX_DASKWS_DIR=${KNOX_HOME}/data/services/daskws/0.1.0
 
-CONDA_PACKAGES=("dask=${DASK_VERSION}" 'dask-bigquery' 'dask-ml' 'dask-sql')
+CONDA_PACKAGES=('dask-bigquery' 'dask-ml' 'dask-sql')
 
 if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
   # Pin `distributed` package version because `dask-yarn` 0.9
@@ -343,6 +343,8 @@ EOF
 
 function main() {
+  #Install dask with the help of conda as installing with mamba causes version conflicts
+  execute_with_retries "conda install -y dask=${DASK_VERSION}"
   # Install conda packages
   execute_with_retries "mamba install -y ${CONDA_PACKAGES[*]}"
 
diff --git a/hue/test_hue.py b/hue/test_hue.py
index b119dffcd..80a1128d4 100644
--- a/hue/test_hue.py
+++ b/hue/test_hue.py
@@ -60,6 +60,8 @@ def test_hue(self, configuration, machine_suffixes):
       'STANDARD',
   )
   def test_hue_job(self, configuration):
+    if self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("Not supported in 2.2 image")
     self.createCluster(configuration, self.INIT_ACTIONS)
     self.__submit_pyspark_job(self.getClusterName())
 
diff --git a/kafka/test_kafka.py b/kafka/test_kafka.py
index 4d4703031..102d5957c 100644
--- a/kafka/test_kafka.py
+++ b/kafka/test_kafka.py
@@ -104,7 +104,7 @@ def test_kafka_manager(self, configuration, machine_suffixes):
     if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
       self.skipTest("Java 11 or higher is required for CMAK")
 
-    metadata = 'run-on-master=true, kafka-enable-jmx=true'
+    metadata = 'run-on-master=true,kafka-enable-jmx=true'
     self.createCluster(configuration, self.KAFKA_MANAGER_INIT_ACTION, metadata=metadata)
     for machine_suffix in machine_suffixes:
       self.verify_instance("{}-{}".format(self.getClusterName(),
@@ -121,7 +121,7 @@ def test_kafka_manager_job(self, configuration):
     if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
       self.skipTest("Java 11 or higher is required for CMAK")
 
-    metadata = 'run-on-master=true, kafka-enable-jmx=true, install-kafka-python=true'
+    metadata = 'run-on-master=true,kafka-enable-jmx=true,install-kafka-python=true'
     properties = 'dataproc:alpha.components=ZOOKEEPER'
     self.createCluster(configuration, self.KAFKA_MANAGER_INIT_ACTION,
                        metadata=metadata, properties=properties)
diff --git a/livy/test_livy.py b/livy/test_livy.py
index 5fb2bcd7a..3b4a57063 100644
--- a/livy/test_livy.py
+++ b/livy/test_livy.py
@@ -27,10 +27,16 @@ def _verify_instance(self, name):
     self.remove_test_script(self.TEST_SCRIPT_FILE_NAME, name)
 
   def _run_python_test_file(self, name):
-    self.assert_instance_command(
-        name,
-        "sudo apt-get install -y python3-pip && sudo pip3 install requests"
-    )
+    if self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.assert_instance_command(
+          name,
+          "sudo apt install python3-requests"
+      )
+    else:
+      self.assert_instance_command(
+          name,
+          "sudo apt-get install -y python3-pip && sudo pip3 install requests"
+      )
     self.assert_instance_command(
         name, "sudo python3 {}".format(self.TEST_SCRIPT_FILE_NAME))
 
diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index 267e800b5..4bdf4a876 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -64,6 +64,9 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
     if self.getImageVersion() == pkg_resources.parse_version("2.1") or self.getImageOs() == "rocky":
       self.skipTest("Not supported in image2.1 or rocky images")
 
+    if self.getImageVersion() == pkg_resources.parse_version("2.2") and self.getImageOs() == "debian":
+      self.skipTest("The Debian version (12) for Dataproc 2.2 is not supported")
+
     optional_components = None
     metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
 
@@ -94,6 +97,9 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
     if self.getImageVersion() == pkg_resources.parse_version("2.1") or self.getImageOs() == "rocky":
       self.skipTest("Not supported in image2.1 or rocky images")
 
+    if self.getImageVersion() == pkg_resources.parse_version("2.2") and self.getImageOs() == "debian":
+      self.skipTest("The Debian version (12) for Dataproc 2.2 is not supported")
+
     optional_components = None
     metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
 
@@ -124,6 +130,9 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
     if self.getImageVersion() == pkg_resources.parse_version("2.1") or self.getImageOs() == "rocky":
       self.skipTest("Not supported in image2.1 or rocky images")
 
+    if self.getImageVersion() == pkg_resources.parse_version("2.2") and self.getImageOs() == "debian":
+      self.skipTest("The Debian version (12) for Dataproc 2.2 is not supported")
+
     metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
                 ",cuda-version={0},driver-version={1}".format(cuda_version,
                                                               driver_version))
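
Note on the test changes above: the new guards in hue/test_hue.py, livy/test_livy.py, and spark-rapids/test_spark_rapids.py all use the same image-version gating pattern, comparing self.getImageVersion() against pkg_resources.parse_version() and skipping before any cluster is created. Below is a minimal, self-contained sketch of that pattern; the class and its image_version/image_os attributes are hypothetical stand-ins for the real DataprocTestCase harness and are not part of this patch.

    import unittest

    import pkg_resources


    class VersionGateSketch(unittest.TestCase):
      # Hypothetical fixed values; in the real tests these come from the
      # test harness and the cluster configuration under test.
      image_version = "2.2"
      image_os = "debian"

      def getImageVersion(self):
        return pkg_resources.parse_version(self.image_version)

      def getImageOs(self):
        return self.image_os

      def test_feature(self):
        # Same shape as the guards added above: bail out early on
        # unsupported image/OS combinations instead of failing later.
        if self.getImageVersion() >= pkg_resources.parse_version("2.2"):
          self.skipTest("Not supported in 2.2 image")
        if self.getImageVersion() == pkg_resources.parse_version("2.2") and self.getImageOs() == "debian":
          self.skipTest("The Debian version (12) for Dataproc 2.2 is not supported")


    if __name__ == '__main__':
      unittest.main()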