From 2d0afde997bd5d810332f368ff67ac33742989b9 Mon Sep 17 00:00:00 2001 From: mingji Date: Wed, 8 Nov 2023 16:50:51 +0800 Subject: [PATCH 1/7] [CELEBORN-1116] Read authentication configs from hadoop_conf_dir --- README.md | 4 +++ .../apache/celeborn/common/CelebornConf.scala | 9 ------- .../common/util/CelebornHadoopUtils.scala | 27 ++++++++++--------- docs/configuration/master.md | 1 - docs/configuration/worker.md | 1 - docs/deploy.md | 4 +++ sbin/load-celeborn-env.sh | 2 -- 7 files changed, 23 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 1820734be31..c66cdd1b6dd 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,10 @@ celeborn.rpc.askTimeout 240s celeborn.worker.flusher.hdfs.buffer.size 4m celeborn.storage.hdfs.dir hdfs:///celeborn celeborn.worker.replicate.fastFail.duration 240s +# If you HDFS is enabled with Kerberos. +# You will need to set the following configs or use kinit to get valid TGT. +celeborn.storage.hdfs.kerberos.principal user@REALM +celeborn.storage.hdfs.kerberos.keytab /path/to/user.keytab # If your hosts have disk raid or use lvm, set celeborn.worker.monitor.disk.enabled to false celeborn.worker.monitor.disk.enabled false diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index 6923fea3f10..8a49bc8df98 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -1069,7 +1069,6 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se // ////////////////////////////////////////////////////// // kerberos // // ////////////////////////////////////////////////////// - def hdfsStorageKerberosEnabled = get(HDFS_STORAGE_TYPE_KERBEROS_ENABLED) def hdfsStorageKerberosPrincipal = get(HDFS_STORAGE_KERBEROS_PRINCIPAL) def hdfsStorageKerberosKeytab = get(HDFS_STORAGE_KERBEROS_KEYTAB) } @@ -4017,14 +4016,6 @@ object CelebornConf extends Logging { .intConf .createWithDefault(64) - val HDFS_STORAGE_TYPE_KERBEROS_ENABLED: ConfigEntry[Boolean] = - buildConf("celeborn.storage.hdfs.kerberos.enabled") - .categories("master", "worker") - .version("0.3.2") - .doc("Whether to enable kerberos authentication for HDFS storage connection.") - .booleanConf - .createWithDefault(false) - val HDFS_STORAGE_KERBEROS_PRINCIPAL: OptionalConfigEntry[String] = buildConf("celeborn.storage.hdfs.kerberos.principal") .categories("master", "worker") diff --git a/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala b/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala index 1135d5bcbc7..da885410f81 100644 --- a/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala +++ b/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala @@ -80,18 +80,21 @@ object CelebornHadoopUtils extends Logging { // If we are accessing HDFS and it has Kerberos enabled, we have to login // from a keytab file so that we can access HDFS beyond the kerberos ticket expiration. UserGroupInformation.setConfiguration(hadoopConf) - if (conf.hdfsStorageKerberosEnabled) { - val principal = conf.hdfsStorageKerberosPrincipal - .getOrElse(throw new NoSuchElementException( - CelebornConf.HDFS_STORAGE_KERBEROS_PRINCIPAL.key)) - val keytab = conf.hdfsStorageKerberosKeytab - .getOrElse(throw new NoSuchElementException(CelebornConf.HDFS_STORAGE_KERBEROS_KEYTAB.key)) - if (!new File(keytab).exists()) { - throw new CelebornException(s"Keytab file: ${keytab} does not exist") - } else { - logInfo("Attempting to login to Kerberos " + - s"using principal: ${principal} and keytab: ${keytab}") - UserGroupInformation.loginUserFromKeytab(principal, keytab) + if ("kerberos".equals(hadoopConf.get("hadoop.security.authentication").toLowerCase)) { + val principalOpt = conf.hdfsStorageKerberosPrincipal + val keytabOpt = conf.hdfsStorageKerberosKeytab + (principalOpt, keytabOpt) match { + case (Some(principal), Some(keytab)) => + logInfo("Attempting to login to Kerberos " + + s"using principal: ${principal} and keytab: ${keytab}") + if (!new File(keytab).exists()) { + throw new CelebornException(s"Keytab file: ${keytab} does not exist") + } + UserGroupInformation.loginUserFromKeytab(principal, keytab) + case _ => + logInfo("Kerberos is enabled without principal and keytab supplied," + + " assuming keytab is managed externally") + UserGroupInformation.getCurrentUser() } } } diff --git a/docs/configuration/master.md b/docs/configuration/master.md index 291e5643567..c21a76edde5 100644 --- a/docs/configuration/master.md +++ b/docs/configuration/master.md @@ -40,7 +40,6 @@ license: | | celeborn.master.workerUnavailableInfo.expireTimeout | 1800s | Worker unavailable info would be cleared when the retention period is expired | 0.3.1 | | celeborn.storage.availableTypes | HDD | Enabled storages. Available options: MEMORY,HDD,SSD,HDFS. Note: HDD and SSD would be treated as identical. | 0.3.0 | | celeborn.storage.hdfs.dir | <undefined> | HDFS base directory for Celeborn to store shuffle data. | 0.2.0 | -| celeborn.storage.hdfs.kerberos.enabled | false | Whether to enable kerberos authentication for HDFS storage connection. | 0.3.2 | | celeborn.storage.hdfs.kerberos.keytab | <undefined> | Kerberos keytab file path for HDFS storage connection. | 0.3.2 | | celeborn.storage.hdfs.kerberos.principal | <undefined> | Kerberos principal for HDFS storage connection. | 0.3.2 | diff --git a/docs/configuration/worker.md b/docs/configuration/worker.md index 965bf320c62..97c9809f4d0 100644 --- a/docs/configuration/worker.md +++ b/docs/configuration/worker.md @@ -24,7 +24,6 @@ license: | | celeborn.shuffle.chunk.size | 8m | Max chunk size of reducer's merged shuffle data. For example, if a reducer's shuffle data is 128M and the data will need 16 fetch chunk requests to fetch. | 0.2.0 | | celeborn.storage.availableTypes | HDD | Enabled storages. Available options: MEMORY,HDD,SSD,HDFS. Note: HDD and SSD would be treated as identical. | 0.3.0 | | celeborn.storage.hdfs.dir | <undefined> | HDFS base directory for Celeborn to store shuffle data. | 0.2.0 | -| celeborn.storage.hdfs.kerberos.enabled | false | Whether to enable kerberos authentication for HDFS storage connection. | 0.3.2 | | celeborn.storage.hdfs.kerberos.keytab | <undefined> | Kerberos keytab file path for HDFS storage connection. | 0.3.2 | | celeborn.storage.hdfs.kerberos.principal | <undefined> | Kerberos principal for HDFS storage connection. | 0.3.2 | | celeborn.worker.activeConnection.max | <undefined> | If the number of active connections on a worker exceeds this configuration value, the worker will be marked as high-load in the heartbeat report, and the master will not include that node in the response of RequestSlots. | 0.3.1 | diff --git a/docs/deploy.md b/docs/deploy.md index b91f74ced53..9270b4ff5bb 100644 --- a/docs/deploy.md +++ b/docs/deploy.md @@ -58,6 +58,10 @@ celeborn.rpc.askTimeout 240s celeborn.worker.flusher.hdfs.buffer.size 4m celeborn.storage.hdfs.dir hdfs:///celeborn celeborn.worker.replicate.fastFail.duration 240s +# If you HDFS is enabled with Kerberos. +# You will need to set the following configs or use kinit to get valid TGT. +celeborn.storage.hdfs.kerberos.principal user@REALM +celeborn.storage.hdfs.kerberos.keytab /path/to/user.keytab # If your hosts have disk raid or use lvm, set celeborn.worker.monitor.disk.enabled to false celeborn.worker.monitor.disk.enabled false diff --git a/sbin/load-celeborn-env.sh b/sbin/load-celeborn-env.sh index 7389dc87e40..e446331207b 100755 --- a/sbin/load-celeborn-env.sh +++ b/sbin/load-celeborn-env.sh @@ -16,8 +16,6 @@ # limitations under the License. # -unset HADOOP_CONF_DIR - # included in all the celeborn scripts with source command # should not be executable directly # also should not be passed any arguments, since we need original $* From c7c478fa25c791bfa309e9c0b5a07e6d7a987309 Mon Sep 17 00:00:00 2001 From: mingji Date: Wed, 8 Nov 2023 20:56:42 +0800 Subject: [PATCH 2/7] remove unused comments --- .../org/apache/celeborn/common/util/CelebornHadoopUtils.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala b/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala index da885410f81..03b4e9899d0 100644 --- a/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala +++ b/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala @@ -77,8 +77,6 @@ object CelebornHadoopUtils extends Logging { } def initKerberos(conf: CelebornConf, hadoopConf: Configuration): Unit = { - // If we are accessing HDFS and it has Kerberos enabled, we have to login - // from a keytab file so that we can access HDFS beyond the kerberos ticket expiration. UserGroupInformation.setConfiguration(hadoopConf) if ("kerberos".equals(hadoopConf.get("hadoop.security.authentication").toLowerCase)) { val principalOpt = conf.hdfsStorageKerberosPrincipal From 730f1456f2b7f074349a766813ed491f83102d52 Mon Sep 17 00:00:00 2001 From: Cheng Pan Date: Wed, 8 Nov 2023 22:07:22 +0800 Subject: [PATCH 3/7] Update common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala --- .../org/apache/celeborn/common/util/CelebornHadoopUtils.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala b/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala index 03b4e9899d0..3e4e766fb3f 100644 --- a/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala +++ b/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala @@ -79,9 +79,7 @@ object CelebornHadoopUtils extends Logging { def initKerberos(conf: CelebornConf, hadoopConf: Configuration): Unit = { UserGroupInformation.setConfiguration(hadoopConf) if ("kerberos".equals(hadoopConf.get("hadoop.security.authentication").toLowerCase)) { - val principalOpt = conf.hdfsStorageKerberosPrincipal - val keytabOpt = conf.hdfsStorageKerberosKeytab - (principalOpt, keytabOpt) match { + (conf.hdfsStorageKerberosPrincipal, conf.hdfsStorageKerberosKeytab) match { case (Some(principal), Some(keytab)) => logInfo("Attempting to login to Kerberos " + s"using principal: ${principal} and keytab: ${keytab}") From 588b30297c7ca4a215656f44d44b84c30a6404ad Mon Sep 17 00:00:00 2001 From: Ethan Feng Date: Thu, 9 Nov 2023 10:16:43 +0800 Subject: [PATCH 4/7] Update docs/deploy.md Co-authored-by: Cheng Pan --- docs/deploy.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/deploy.md b/docs/deploy.md index 9270b4ff5bb..c289b57bb3f 100644 --- a/docs/deploy.md +++ b/docs/deploy.md @@ -58,8 +58,7 @@ celeborn.rpc.askTimeout 240s celeborn.worker.flusher.hdfs.buffer.size 4m celeborn.storage.hdfs.dir hdfs:///celeborn celeborn.worker.replicate.fastFail.duration 240s -# If you HDFS is enabled with Kerberos. -# You will need to set the following configs or use kinit to get valid TGT. +# Either principal/keytab or valid TGT cache is required to access kerberized HDFS celeborn.storage.hdfs.kerberos.principal user@REALM celeborn.storage.hdfs.kerberos.keytab /path/to/user.keytab From 884eca6566a163f2958a2aca6949db18cb7b7dcf Mon Sep 17 00:00:00 2001 From: mingji Date: Thu, 9 Nov 2023 10:17:43 +0800 Subject: [PATCH 5/7] update docs --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index c66cdd1b6dd..e1d06d177d4 100644 --- a/README.md +++ b/README.md @@ -136,8 +136,7 @@ celeborn.rpc.askTimeout 240s celeborn.worker.flusher.hdfs.buffer.size 4m celeborn.storage.hdfs.dir hdfs:///celeborn celeborn.worker.replicate.fastFail.duration 240s -# If you HDFS is enabled with Kerberos. -# You will need to set the following configs or use kinit to get valid TGT. +# Either principal/keytab or valid TGT cache is required to access kerberized HDFS celeborn.storage.hdfs.kerberos.principal user@REALM celeborn.storage.hdfs.kerberos.keytab /path/to/user.keytab From 19d92e1db83144ee09510089a5495d28115beab7 Mon Sep 17 00:00:00 2001 From: Fu Chen Date: Thu, 9 Nov 2023 10:35:11 +0800 Subject: [PATCH 6/7] Update common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala --- .../org/apache/celeborn/common/util/CelebornHadoopUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala b/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala index 3e4e766fb3f..3ec0bda945b 100644 --- a/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala +++ b/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala @@ -81,8 +81,8 @@ object CelebornHadoopUtils extends Logging { if ("kerberos".equals(hadoopConf.get("hadoop.security.authentication").toLowerCase)) { (conf.hdfsStorageKerberosPrincipal, conf.hdfsStorageKerberosKeytab) match { case (Some(principal), Some(keytab)) => - logInfo("Attempting to login to Kerberos " + - s"using principal: ${principal} and keytab: ${keytab}") + logInfo( + s"Attempting to login to Kerberos using principal: $principal and keytab: $keytab") if (!new File(keytab).exists()) { throw new CelebornException(s"Keytab file: ${keytab} does not exist") } From 2bfc9789e7d90450e7f68dafe3b45e4828a6cf57 Mon Sep 17 00:00:00 2001 From: Fu Chen Date: Thu, 9 Nov 2023 10:35:21 +0800 Subject: [PATCH 7/7] Update common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala --- .../org/apache/celeborn/common/util/CelebornHadoopUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala b/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala index 3ec0bda945b..ae07187e7f3 100644 --- a/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala +++ b/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala @@ -84,7 +84,7 @@ object CelebornHadoopUtils extends Logging { logInfo( s"Attempting to login to Kerberos using principal: $principal and keytab: $keytab") if (!new File(keytab).exists()) { - throw new CelebornException(s"Keytab file: ${keytab} does not exist") + throw new CelebornException(s"Keytab file: $keytab does not exist") } UserGroupInformation.loginUserFromKeytab(principal, keytab) case _ =>