From 2ee95cc6f265d41e896cd3516d0964f1f3545728 Mon Sep 17 00:00:00 2001 From: mingji Date: Thu, 9 Nov 2023 11:07:13 +0800 Subject: [PATCH] [CELEBORN-1116] Read authentication configs from `HADOOP_CONF_DIR` ### What changes were proposed in this pull request? 1. Make Celeborn read configs from HADOOP_CONF_DIR. 2. Remove unnecessary Kerberos configs. ### Why are the changes needed? To support HDFS with Kerberos. ### Does this PR introduce _any_ user-facing change? NO. ### How was this patch tested? GA and cluster. Closes #2082 from FMX/B1116. Lead-authored-by: mingji Co-authored-by: Fu Chen Co-authored-by: Cheng Pan Co-authored-by: Ethan Feng Signed-off-by: zky.zhoukeyong --- README.md | 3 +++ .../apache/celeborn/common/CelebornConf.scala | 9 ------- .../common/util/CelebornHadoopUtils.scala | 27 +++++++++---------- docs/configuration/master.md | 1 - docs/configuration/worker.md | 1 - docs/deploy.md | 3 +++ sbin/load-celeborn-env.sh | 2 -- 7 files changed, 19 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 6038ba2508b..108558fbd27 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,9 @@ celeborn.rpc.askTimeout 240s celeborn.worker.flusher.hdfs.buffer.size 4m celeborn.storage.hdfs.dir hdfs:///celeborn celeborn.worker.replicate.fastFail.duration 240s +# Either principal/keytab or valid TGT cache is required to access kerberized HDFS +celeborn.storage.hdfs.kerberos.principal user@REALM +celeborn.storage.hdfs.kerberos.keytab /path/to/user.keytab # If your hosts have disk raid or use lvm, set celeborn.worker.monitor.disk.enabled to false celeborn.worker.monitor.disk.enabled false diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index 9bdbfbe8227..ef7b63567d0 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -1066,7 +1066,6 @@ class 
CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se // ////////////////////////////////////////////////////// // kerberos // // ////////////////////////////////////////////////////// - def hdfsStorageKerberosEnabled = get(HDFS_STORAGE_TYPE_KERBEROS_ENABLED) def hdfsStorageKerberosPrincipal = get(HDFS_STORAGE_KERBEROS_PRINCIPAL) def hdfsStorageKerberosKeytab = get(HDFS_STORAGE_KERBEROS_KEYTAB) } @@ -3993,14 +3992,6 @@ object CelebornConf extends Logging { .intConf .createWithDefault(64) - val HDFS_STORAGE_TYPE_KERBEROS_ENABLED: ConfigEntry[Boolean] = - buildConf("celeborn.storage.hdfs.kerberos.enabled") - .categories("master", "worker") - .version("0.3.2") - .doc("Whether to enable kerberos authentication for HDFS storage connection.") - .booleanConf - .createWithDefault(false) - val HDFS_STORAGE_KERBEROS_PRINCIPAL: OptionalConfigEntry[String] = buildConf("celeborn.storage.hdfs.kerberos.principal") .categories("master", "worker") diff --git a/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala b/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala index 1135d5bcbc7..ae07187e7f3 100644 --- a/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala +++ b/common/src/main/scala/org/apache/celeborn/common/util/CelebornHadoopUtils.scala @@ -77,21 +77,20 @@ object CelebornHadoopUtils extends Logging { } def initKerberos(conf: CelebornConf, hadoopConf: Configuration): Unit = { - // If we are accessing HDFS and it has Kerberos enabled, we have to login - // from a keytab file so that we can access HDFS beyond the kerberos ticket expiration. 
UserGroupInformation.setConfiguration(hadoopConf) - if (conf.hdfsStorageKerberosEnabled) { - val principal = conf.hdfsStorageKerberosPrincipal - .getOrElse(throw new NoSuchElementException( - CelebornConf.HDFS_STORAGE_KERBEROS_PRINCIPAL.key)) - val keytab = conf.hdfsStorageKerberosKeytab - .getOrElse(throw new NoSuchElementException(CelebornConf.HDFS_STORAGE_KERBEROS_KEYTAB.key)) - if (!new File(keytab).exists()) { - throw new CelebornException(s"Keytab file: ${keytab} does not exist") - } else { - logInfo("Attempting to login to Kerberos " + - s"using principal: ${principal} and keytab: ${keytab}") - UserGroupInformation.loginUserFromKeytab(principal, keytab) + if ("kerberos".equals(hadoopConf.get("hadoop.security.authentication").toLowerCase)) { + (conf.hdfsStorageKerberosPrincipal, conf.hdfsStorageKerberosKeytab) match { + case (Some(principal), Some(keytab)) => + logInfo( + s"Attempting to login to Kerberos using principal: $principal and keytab: $keytab") + if (!new File(keytab).exists()) { + throw new CelebornException(s"Keytab file: $keytab does not exist") + } + UserGroupInformation.loginUserFromKeytab(principal, keytab) + case _ => + logInfo("Kerberos is enabled without principal and keytab supplied," + + " assuming keytab is managed externally") + UserGroupInformation.getCurrentUser() } } } diff --git a/docs/configuration/master.md b/docs/configuration/master.md index 12a643eb4d8..1474f08cb4a 100644 --- a/docs/configuration/master.md +++ b/docs/configuration/master.md @@ -38,7 +38,6 @@ license: | | celeborn.master.workerUnavailableInfo.expireTimeout | 1800s | Worker unavailable info would be cleared when the retention period is expired | 0.3.1 | | celeborn.storage.availableTypes | HDD | Enabled storages. Available options: MEMORY,HDD,SSD,HDFS. Note: HDD and SSD would be treated as identical. | 0.3.0 | | celeborn.storage.hdfs.dir | <undefined> | HDFS base directory for Celeborn to store shuffle data. 
| 0.2.0 | -| celeborn.storage.hdfs.kerberos.enabled | false | Whether to enable kerberos authentication for HDFS storage connection. | 0.3.2 | | celeborn.storage.hdfs.kerberos.keytab | <undefined> | Kerberos keytab file path for HDFS storage connection. | 0.3.2 | | celeborn.storage.hdfs.kerberos.principal | <undefined> | Kerberos principal for HDFS storage connection. | 0.3.2 | diff --git a/docs/configuration/worker.md b/docs/configuration/worker.md index 1de42852931..ac922fe16b0 100644 --- a/docs/configuration/worker.md +++ b/docs/configuration/worker.md @@ -24,7 +24,6 @@ license: | | celeborn.shuffle.chunk.size | 8m | Max chunk size of reducer's merged shuffle data. For example, if a reducer's shuffle data is 128M and the data will need 16 fetch chunk requests to fetch. | 0.2.0 | | celeborn.storage.availableTypes | HDD | Enabled storages. Available options: MEMORY,HDD,SSD,HDFS. Note: HDD and SSD would be treated as identical. | 0.3.0 | | celeborn.storage.hdfs.dir | <undefined> | HDFS base directory for Celeborn to store shuffle data. | 0.2.0 | -| celeborn.storage.hdfs.kerberos.enabled | false | Whether to enable kerberos authentication for HDFS storage connection. | 0.3.2 | | celeborn.storage.hdfs.kerberos.keytab | <undefined> | Kerberos keytab file path for HDFS storage connection. | 0.3.2 | | celeborn.storage.hdfs.kerberos.principal | <undefined> | Kerberos principal for HDFS storage connection. | 0.3.2 | | celeborn.worker.activeConnection.max | <undefined> | If the number of active connections on a worker exceeds this configuration value, the worker will be marked as high-load in the heartbeat report, and the master will not include that node in the response of RequestSlots. 
| 0.3.1 | diff --git a/docs/deploy.md b/docs/deploy.md index bafa9691a73..9019c879058 100644 --- a/docs/deploy.md +++ b/docs/deploy.md @@ -56,6 +56,9 @@ celeborn.rpc.askTimeout 240s celeborn.worker.flusher.hdfs.buffer.size 4m celeborn.storage.hdfs.dir hdfs:///celeborn celeborn.worker.replicate.fastFail.duration 240s +# Either principal/keytab or valid TGT cache is required to access kerberized HDFS +celeborn.storage.hdfs.kerberos.principal user@REALM +celeborn.storage.hdfs.kerberos.keytab /path/to/user.keytab # If your hosts have disk raid or use lvm, set celeborn.worker.monitor.disk.enabled to false celeborn.worker.monitor.disk.enabled false diff --git a/sbin/load-celeborn-env.sh b/sbin/load-celeborn-env.sh index 7389dc87e40..e446331207b 100755 --- a/sbin/load-celeborn-env.sh +++ b/sbin/load-celeborn-env.sh @@ -16,8 +16,6 @@ # limitations under the License. # -unset HADOOP_CONF_DIR - # included in all the celeborn scripts with source command # should not be executable directly # also should not be passed any arguments, since we need original $*