-
Notifications
You must be signed in to change notification settings - Fork 374
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into CELEBORN-1122
- Loading branch information
Showing
181 changed files
with
6,191 additions
and
853 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
87 changes: 87 additions & 0 deletions
87
assets/spark-patch/Celeborn_Dynamic_Allocation_spark3_0.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
Subject: [PATCH] [CORE][SHUFFLE] Support enabling DRA with Apache Celeborn | ||
--- | ||
Index: core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala | ||
IDEA additional info: | ||
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP | ||
<+>UTF-8 | ||
=================================================================== | ||
diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala | ||
--- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala (revision 65ac1e75dc468f53fc778cd2ce1ba3f21067aab8) | ||
+++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala (revision 1f5cb2ec9e9652d03c9775954b69a708b5d05ab3) | ||
@@ -198,7 +198,7 @@ | ||
if (!conf.get(config.SHUFFLE_SERVICE_ENABLED)) { | ||
if (conf.get(config.DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED)) { | ||
logWarning("Dynamic allocation without a shuffle service is an experimental feature.") | ||
- } else if (!testing) { | ||
+ } else if (!testing && !Utils.isCelebornEnabled(conf)) { | ||
throw new SparkException("Dynamic allocation of executors requires the external " + | ||
"shuffle service. You may enable this through spark.shuffle.service.enabled.") | ||
} | ||
Index: core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala | ||
IDEA additional info: | ||
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP | ||
<+>UTF-8 | ||
=================================================================== | ||
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala | ||
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala (revision 65ac1e75dc468f53fc778cd2ce1ba3f21067aab8) | ||
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala (revision 1f5cb2ec9e9652d03c9775954b69a708b5d05ab3) | ||
@@ -1851,7 +1851,8 @@ | ||
// if the cluster manager explicitly tells us that the entire worker was lost, then | ||
// we know to unregister shuffle output. (Note that "worker" specifically refers to the process | ||
// from a Standalone cluster, where the shuffle service lives in the Worker.) | ||
- val fileLost = workerLost || !env.blockManager.externalShuffleServiceEnabled | ||
+ val fileLost = !Utils.isCelebornEnabled(sc.getConf) && | ||
+ (workerLost || !env.blockManager.externalShuffleServiceEnabled) | ||
removeExecutorAndUnregisterOutputs( | ||
execId = execId, | ||
fileLost = fileLost, | ||
Index: core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala | ||
IDEA additional info: | ||
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP | ||
<+>UTF-8 | ||
=================================================================== | ||
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala | ||
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala (revision 65ac1e75dc468f53fc778cd2ce1ba3f21067aab8) | ||
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala (revision 1f5cb2ec9e9652d03c9775954b69a708b5d05ab3) | ||
@@ -944,7 +944,7 @@ | ||
// The reason is the next stage wouldn't be able to fetch the data from this dead executor | ||
// so we would need to rerun these tasks on other executors. | ||
if (tasks(0).isInstanceOf[ShuffleMapTask] && !env.blockManager.externalShuffleServiceEnabled | ||
- && !isZombie) { | ||
+ && !isZombie && !Utils.isCelebornEnabled(conf)) { | ||
for ((tid, info) <- taskInfos if info.executorId == execId) { | ||
val index = taskInfos(tid).index | ||
// We may have a running task whose partition has been marked as successful, | ||
Index: core/src/main/scala/org/apache/spark/util/Utils.scala | ||
IDEA additional info: | ||
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP | ||
<+>UTF-8 | ||
=================================================================== | ||
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala | ||
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala (revision 65ac1e75dc468f53fc778cd2ce1ba3f21067aab8) | ||
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala (revision 1f5cb2ec9e9652d03c9775954b69a708b5d05ab3) | ||
@@ -2934,6 +2934,9 @@ | ||
props.forEach((k, v) => resultProps.put(k, v)) | ||
resultProps | ||
} | ||
+ | ||
+ def isCelebornEnabled(conf: SparkConf): Boolean = | ||
+ conf.get("spark.shuffle.manager", "sort").contains("celeborn") | ||
} | ||
|
||
private[util] object CallerContext extends Logging { |
91 changes: 91 additions & 0 deletions
91
assets/spark-patch/Celeborn_Dynamic_Allocation_spark3_1.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
Subject: [PATCH] [CORE][SHUFFLE] Support enabling DRA with Apache Celeborn | ||
--- | ||
Index: core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala | ||
IDEA additional info: | ||
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP | ||
<+>UTF-8 | ||
=================================================================== | ||
diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala | ||
--- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala (revision d1f8a503a26bcfb4e466d9accc5fa241a7933667) | ||
+++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala (revision c8609fa08534e1cc6fd377b3573e5e65ddc79f15) | ||
@@ -210,7 +210,7 @@ | ||
(decommissionEnabled && | ||
conf.get(config.STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED))) { | ||
logWarning("Dynamic allocation without a shuffle service is an experimental feature.") | ||
- } else if (!testing) { | ||
+ } else if (!testing && !Utils.isCelebornEnabled(conf)) { | ||
throw new SparkException("Dynamic allocation of executors requires the external " + | ||
"shuffle service. You may enable this through spark.shuffle.service.enabled.") | ||
} | ||
Index: core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala | ||
IDEA additional info: | ||
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP | ||
<+>UTF-8 | ||
=================================================================== | ||
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala | ||
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala (revision d1f8a503a26bcfb4e466d9accc5fa241a7933667) | ||
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala (revision c8609fa08534e1cc6fd377b3573e5e65ddc79f15) | ||
@@ -2080,7 +2080,8 @@ | ||
// if the cluster manager explicitly tells us that the entire worker was lost, then | ||
// we know to unregister shuffle output. (Note that "worker" specifically refers to the process | ||
// from a Standalone cluster, where the shuffle service lives in the Worker.) | ||
- val fileLost = workerHost.isDefined || !env.blockManager.externalShuffleServiceEnabled | ||
+ val fileLost = !Utils.isCelebornEnabled(sc.getConf) && | ||
+ (workerHost.isDefined || !env.blockManager.externalShuffleServiceEnabled) | ||
removeExecutorAndUnregisterOutputs( | ||
execId = execId, | ||
fileLost = fileLost, | ||
Index: core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala | ||
IDEA additional info: | ||
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP | ||
<+>UTF-8 | ||
=================================================================== | ||
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala | ||
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala (revision d1f8a503a26bcfb4e466d9accc5fa241a7933667) | ||
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala (revision c8609fa08534e1cc6fd377b3573e5e65ddc79f15) | ||
@@ -973,7 +973,8 @@ | ||
// and we are not using an external shuffle server which could serve the shuffle outputs. | ||
// The reason is the next stage wouldn't be able to fetch the data from this dead executor | ||
// so we would need to rerun these tasks on other executors. | ||
- if (isShuffleMapTasks && !env.blockManager.externalShuffleServiceEnabled && !isZombie) { | ||
+ if (isShuffleMapTasks && !env.blockManager.externalShuffleServiceEnabled && !isZombie && | ||
+ !Utils.isCelebornEnabled(conf)) { | ||
for ((tid, info) <- taskInfos if info.executorId == execId) { | ||
val index = taskInfos(tid).index | ||
// We may have a running task whose partition has been marked as successful, | ||
Index: core/src/main/scala/org/apache/spark/util/Utils.scala | ||
IDEA additional info: | ||
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP | ||
<+>UTF-8 | ||
=================================================================== | ||
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala | ||
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala (revision d1f8a503a26bcfb4e466d9accc5fa241a7933667) | ||
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala (revision c8609fa08534e1cc6fd377b3573e5e65ddc79f15) | ||
@@ -3057,7 +3057,12 @@ | ||
0 | ||
} | ||
} | ||
+ | ||
+ def isCelebornEnabled(conf: SparkConf): Boolean = | ||
+ conf.get("spark.shuffle.manager", "sort").contains("celeborn") | ||
} | ||
+ | ||
+ | ||
|
||
private[util] object CallerContext extends Logging { | ||
val callerContextSupported: Boolean = { |
File renamed without changes.
Oops, something went wrong.