Skip to content

Commit

Permalink
Merge pull request #3 from Telefonica/spark35
Browse files Browse the repository at this point in the history
Spark35
  • Loading branch information
pradomota authored Oct 23, 2024
2 parents a3102cf + 9ac264c commit 4eead32
Show file tree
Hide file tree
Showing 834 changed files with 17,807 additions and 4,611 deletions.
61 changes: 43 additions & 18 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,25 +79,34 @@ jobs:
id: set-outputs
run: |
if [ -z "${{ inputs.jobs }}" ]; then
pyspark=true; sparkr=true; tpcds=true; docker=true;
pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
pyspark=`./dev/is-changed.py -m $pyspark_modules`
sparkr=`./dev/is-changed.py -m sparkr`
tpcds=`./dev/is-changed.py -m sql`
docker=`./dev/is-changed.py -m docker-integration-tests`
# 'build', 'scala-213', and 'java-11-17' are always true for now.
# It does not save significant time and most of PRs trigger the build.
if [[ "${{ github.repository }}" != 'apache/spark' ]]; then
pandas=$pyspark
kubernetes=`./dev/is-changed.py -m kubernetes`
sparkr=`./dev/is-changed.py -m sparkr`
tpcds=`./dev/is-changed.py -m sql`
docker=`./dev/is-changed.py -m docker-integration-tests`
else
pandas=false
kubernetes=false
sparkr=false
tpcds=false
docker=false
fi
build=`./dev/is-changed.py -m "core,unsafe,kvstore,avro,utils,network-common,network-shuffle,repl,launcher,examples,sketch,graphx,catalyst,hive-thriftserver,streaming,sql-kafka-0-10,streaming-kafka-0-10,mllib-local,mllib,yarn,mesos,kubernetes,hadoop-cloud,spark-ganglia-lgpl,sql,hive"`
precondition="
{
\"build\": \"true\",
\"build\": \"$build\",
\"pyspark\": \"$pyspark\",
\"pyspark-pandas\": \"$pandas\",
\"sparkr\": \"$sparkr\",
\"tpcds-1g\": \"$tpcds\",
\"docker-integration-tests\": \"$docker\",
\"scala-213\": \"true\",
\"java-11-17\": \"true\",
\"scala-213\": \"$build\",
\"java-11-17\": \"$build\",
\"lint\" : \"true\",
\"k8s-integration-tests\" : \"true\",
\"k8s-integration-tests\" : \"$kubernetes\",
\"breaking-changes-buf\" : \"true\",
}"
echo $precondition # For debugging
Expand Down Expand Up @@ -204,6 +213,8 @@ jobs:
HIVE_PROFILE: ${{ matrix.hive }}
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
SKIP_UNIDOC: true
SKIP_MIMA: true
SKIP_PACKAGING: true
steps:
- name: Checkout Spark repository
Expand Down Expand Up @@ -256,7 +267,7 @@ jobs:
- name: Install Python packages (Python 3.8)
if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
run: |
python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.56.0' 'protobuf==3.20.3'
python3.8 -m pip install 'numpy>=1.20.0' 'pyarrow==12.0.1' pandas scipy unittest-xml-reporting 'grpcio==1.56.0' 'protobuf==3.20.3'
python3.8 -m pip list
# Run the tests.
- name: Run tests
Expand Down Expand Up @@ -360,6 +371,14 @@ jobs:
pyspark-pandas-connect
- >-
pyspark-pandas-slow-connect
exclude:
# Always run if pyspark-pandas == 'true', even infra-image is skip (such as non-master job)
# In practice, the build will run in individual PR, but not against the individual commit
# in Apache Spark repository.
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow-connect' }}
env:
MODULES_TO_TEST: ${{ matrix.modules }}
HADOOP_PROFILE: ${{ inputs.hadoop }}
Expand Down Expand Up @@ -407,6 +426,8 @@ jobs:
key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
pyspark-coursier-
- name: Free up disk space
run: ./dev/free_disk_space_container
- name: Install Java ${{ matrix.java }}
uses: actions/setup-java@v3
with:
Expand Down Expand Up @@ -504,6 +525,8 @@ jobs:
key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
sparkr-coursier-
- name: Free up disk space
run: ./dev/free_disk_space_container
- name: Install Java ${{ inputs.java }}
uses: actions/setup-java@v3
with:
Expand Down Expand Up @@ -612,6 +635,8 @@ jobs:
key: docs-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
docs-maven-
- name: Free up disk space
run: ./dev/free_disk_space_container
- name: Install Java 8
uses: actions/setup-java@v3
with:
Expand All @@ -621,6 +646,8 @@ jobs:
run: ./dev/check-license
- name: Dependencies test
run: ./dev/test-dependencies.sh
- name: MIMA test
run: ./dev/mima
- name: Scala linter
run: ./dev/lint-scala
- name: Java linter
Expand Down Expand Up @@ -672,16 +699,16 @@ jobs:
# See also https://issues.apache.org/jira/browse/SPARK-35375.
# Pin the MarkupSafe to 2.0.1 to resolve the CI error.
# See also https://issues.apache.org/jira/browse/SPARK-38279.
python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme nbsphinx numpydoc 'jinja2<3.0.0' 'markupsafe==2.0.1' 'pyzmq<24.0.0'
python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme 'sphinx-copybutton==0.5.2' 'nbsphinx==0.9.3' numpydoc 'jinja2<3.0.0' 'markupsafe==2.0.1' 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' 'nest-asyncio==1.5.8' 'rpds-py==0.16.2' 'alabaster==0.7.13'
python3.9 -m pip install ipython_genutils # See SPARK-38517
python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8'
python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' 'pyarrow==12.0.1' pandas 'plotly>=4.8'
python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421
apt-get update -y
apt-get install -y ruby ruby-dev
Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')"
Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')"
Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"
gem install bundler
gem install bundler -v 2.4.22
cd docs
bundle install
- name: R linter
Expand Down Expand Up @@ -1010,9 +1037,7 @@ jobs:
- name: start minikube
run: |
# See more in "Installation" https://minikube.sigs.k8s.io/docs/start/
# curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
# TODO(SPARK-44495): Resume to use the latest minikube for k8s-integration-tests.
curl -LO https://storage.googleapis.com/minikube/releases/v1.30.1/minikube-linux-amd64
curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
sudo install minikube-linux-amd64 /usr/local/bin/minikube
# Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic
minikube start --cpus 2 --memory 6144
Expand All @@ -1030,7 +1055,7 @@ jobs:
kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts || true
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true
eval $(minikube docker-env)
build/sbt -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.driverRequestCores=0.5 -Dspark.kubernetes.test.executorRequestCores=0.2 -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test"
build/sbt -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test"
- name: Upload Spark on K8S integration tests log files
if: failure()
uses: actions/upload-artifact@v3
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
.scala_dependencies
.settings
.vscode
artifacts/
/lib/
R-unit-tests.log
R/unit-tests.out
Expand All @@ -50,6 +51,7 @@ dev/create-release/*final
dev/create-release/*txt
dev/pr-deps/
dist/
docs/_generated/
docs/_site/
docs/api
docs/.local_ruby_bundle
Expand Down
5 changes: 0 additions & 5 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -218,11 +218,6 @@ docs/js/vendor/bootstrap.js
connector/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java


Python Software Foundation License
----------------------------------

python/docs/source/_static/copybutton.js

BSD 3-Clause
------------

Expand Down
2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: SparkR
Type: Package
Version: 3.5.0
Version: 3.5.4
Title: R Front End for 'Apache Spark'
Description: Provides an R Front end for 'Apache Spark' <https://spark.apache.org>.
Authors@R:
Expand Down
8 changes: 7 additions & 1 deletion assembly/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.12</artifactId>
<version>3.5.0</version>
<version>3.5.4-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

Expand Down Expand Up @@ -159,6 +159,12 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-connect_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.spark</groupId>
<artifactId>spark-connect-common_${scala.binary.version}</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
Expand Down
43 changes: 43 additions & 0 deletions binder/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

FROM python:3.10-slim
# install the notebook package
RUN pip install --no-cache notebook jupyterlab

# create user with a home directory
ARG NB_USER
ARG NB_UID
ENV USER ${NB_USER}
ENV HOME /home/${NB_USER}

RUN adduser --disabled-password \
--gecos "Default user" \
--uid ${NB_UID} \
${NB_USER}
WORKDIR ${HOME}
USER ${USER}

# Make sure the contents of our repo are in ${HOME}
COPY . ${HOME}
USER root
RUN chown -R ${NB_UID} ${HOME}
RUN apt-get update && apt-get install -y openjdk-17-jre git coreutils
USER ${NB_USER}

RUN binder/postBuild

2 changes: 0 additions & 2 deletions binder/apt.txt

This file was deleted.

11 changes: 8 additions & 3 deletions binder/postBuild
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,13 @@
# This file is used for Binder integration to install PySpark available in
# Jupyter notebook.

# SPARK-45706: Should fail fast. Otherwise, the Binder image is successfully
# built, and it cannot be rebuilt.
set -o pipefail
set -e

VERSION=$(python -c "exec(open('python/pyspark/version.py').read()); print(__version__)")
TAG=$(git describe --tags --exact-match 2>/dev/null)
TAG=$(git describe --tags --exact-match 2> /dev/null || true)

# If a commit is tagged, exactly specified version of pyspark should be installed to avoid
# a kind of accident that an old version of pyspark is installed in the live notebook environment.
Expand All @@ -33,9 +38,9 @@ else
fi

if [[ ! $VERSION < "3.4.0" ]]; then
pip install plotly "pandas<2.0.0" "pyspark[sql,ml,mllib,pandas_on_spark,connect]$SPECIFIER$VERSION"
pip install plotly "pandas<2.0.0" "numpy>=1.15,<2" "pyspark[sql,ml,mllib,pandas_on_spark,connect]$SPECIFIER$VERSION"
else
pip install plotly "pandas<2.0.0" "pyspark[sql,ml,mllib,pandas_on_spark]$SPECIFIER$VERSION"
pip install plotly "pandas<2.0.0" "numpy>=1.15,<2" "pyspark[sql,ml,mllib,pandas_on_spark]$SPECIFIER$VERSION"
fi

# Set 'PYARROW_IGNORE_TIMEZONE' to surpress warnings from PyArrow.
Expand Down
7 changes: 6 additions & 1 deletion common/kvstore/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.12</artifactId>
<version>3.5.0</version>
<version>3.5.4-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

Expand Down Expand Up @@ -66,6 +66,11 @@
<artifactId>commons-io</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.apache.logging.log4j</groupId>
Expand Down
2 changes: 1 addition & 1 deletion common/network-common/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.12</artifactId>
<version>3.5.0</version>
<version>3.5.4-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,10 @@ public TransportResponseHandler getHandler() {

@Override
public void close() {
// close is a local operation and should finish with milliseconds; timeout just to be safe
// Mark the connection as timed out, so we do not return a connection that's being closed
// from the TransportClientFactory if closing takes some time (e.g. with SSL)
this.timedOut = true;
// close should not take this long; use a timeout just to be safe
channel.close().awaitUninterruptibly(10, TimeUnit.SECONDS);
}

Expand Down
Loading

0 comments on commit 4eead32

Please sign in to comment.