From c10843b21554a52b71c7d8997a902cf716e3c2a6 Mon Sep 17 00:00:00 2001 From: Justin Terry Date: Fri, 26 Jul 2024 14:46:26 -0700 Subject: [PATCH 1/4] Save intermediate files --- app/tasks/cog_assets.py | 3 +++ batch/scripts/cogify.sh | 38 ++++++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/app/tasks/cog_assets.py b/app/tasks/cog_assets.py index 61a2e742b..6b8e6f694 100644 --- a/app/tasks/cog_assets.py +++ b/app/tasks/cog_assets.py @@ -9,6 +9,7 @@ from app.models.pydantic.change_log import ChangeLog from app.models.pydantic.creation_options import COGCreationOptions from app.models.pydantic.jobs import GDALCOGJob, Job +from app.settings.globals import DATA_LAKE_BUCKET from app.tasks import callback_constructor from app.tasks.batch import execute from app.tasks.raster_tile_set_assets.utils import JOB_ENV @@ -90,6 +91,8 @@ async def create_cogify_job( dataset, "-I", creation_options.implementation, + "--prefix", + f"s3://{DATA_LAKE_BUCKET}/{dataset}/{version}/raster/{srid}/cog", ] if creation_options.export_to_gee: diff --git a/batch/scripts/cogify.sh b/batch/scripts/cogify.sh index d85344e6c..000a730eb 100755 --- a/batch/scripts/cogify.sh +++ b/batch/scripts/cogify.sh @@ -3,44 +3,50 @@ set -e # requires arguments -# -s | --source -# -T | --target # --block_size # -r | --resample # -G | --export_to_gee -# -d | --dataset # -I | --implementation +# -t | --target +# --prefix ME=$(basename "$0") -. get_arguments.sh "$@" +#. get_arguments.sh "$@" set -x # download all GeoTiff files -aws s3 cp --recursive --exclude "*" --include "*.tif" "${SRC}" . -# create VRT of input files so we can use gdal_translate -if [ ! -f "merged.vrt" ]; then +if [[ $(aws s3 ls "${PREFIX}/merged.tif") ]]; then + aws s3 cp "${PREFIX}/merged.tif" merged.tif +else + aws s3 cp --recursive --exclude "*" --include "*.tif" "${SRC}" . + + # create VRT of input files so we can use gdal_translate gdalbuildvrt merged.vrt *.tif -fi -# merge all rasters into one huge raster using COG block size -if [ ! -f "merged.tif" ]; then + # merge all rasters into one huge raster using COG block size gdal_translate -of GTiff -co TILED=YES -co BLOCKXSIZE="${BLOCK_SIZE}" -co BLOCKYSIZE="${BLOCK_SIZE}" -co COMPRESS=DEFLATE -co BIGTIFF=IF_SAFER -co NUM_THREADS=ALL_CPUS --config GDAL_CACHEMAX 70% --config GDAL_NUM_THREADS ALL_CPUS merged.vrt merged.tif + aws s3 cp merged.tif "${PREFIX}/merged.tif" fi -# create overviews in raster -if ! gdalinfo "merged.tif" | grep -q "Overviews"; then - gdaladdo merged.tif -r "${RESAMPLE}" --config GDAL_NUM_THREADS ALL_CPUS --config GDAL_CACHEMAX 70% +if [[ $(aws s3 ls "${PREFIX}/merged.tif.ovr") ]]; then + aws s3 cp "${PREFIX}/merged.tif.ovr" merged.tif.ovr +else + # generate overviews externally + gdaladdo merged.tif -r "${RESAMPLE}" -ro --config GDAL_NUM_THREADS ALL_CPUS --config GDAL_CACHEMAX 70% --config COMPRESS_OVERVIEW DEFLATE + aws s3 cp merged.tif.ovr "${PREFIX}/merged.tif.ovr" fi # convert to COG using existing overviews, this adds some additional layout optimizations -if [ ! -f "cog.tif" ]; then - gdal_translate merged.tif cog.tif -of COG -co COMPRESS=DEFLATE -co BLOCKSIZE="${BLOCK_SIZE}" -co BIGTIFF=IF_SAFER -co NUM_THREADS=ALL_CPUS --config GDAL_CACHEMAX 70% --config GDAL_NUM_THREADS ALL_CPUS -fi +gdal_translate merged.tif cog.tif -of COG -co COMPRESS=DEFLATE -co BLOCKSIZE="${BLOCK_SIZE}" -co BIGTIFF=IF_SAFER -co NUM_THREADS=ALL_CPUS -co OVERVIEWS=FORCE_USE_EXISTING --config GDAL_CACHEMAX 70% --config GDAL_NUM_THREADS ALL_CPUS # upload to data lake aws s3 cp cog.tif "${TARGET}" +# delete intermediate file +aws s3 rm "${PREFIX}/merged.tif" +aws s3 rm "${PREFIX}/merged.tif.ovr" + if [ -n "$EXPORT_TO_GEE" ]; then export_to_gee.py --dataset "${DATASET}" --implementation "${IMPLEMENTATION}" fi From ce2b2cd3c3a9971c0eb8837870bcb67a002110c1 Mon Sep 17 00:00:00 2001 From: Justin Terry Date: Mon, 29 Jul 2024 10:50:15 -0700 Subject: [PATCH 2/4] Add back get_arguments --- batch/scripts/cogify.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batch/scripts/cogify.sh b/batch/scripts/cogify.sh index 000a730eb..f923ae821 100755 --- a/batch/scripts/cogify.sh +++ b/batch/scripts/cogify.sh @@ -11,7 +11,7 @@ set -e # --prefix ME=$(basename "$0") -#. get_arguments.sh "$@" +. get_arguments.sh "$@" set -x # download all GeoTiff files From 89c4441b2bbb23072ba6a910b55f23a136ba59d6 Mon Sep 17 00:00:00 2001 From: Justin Terry Date: Tue, 30 Jul 2024 15:20:23 -0700 Subject: [PATCH 3/4] Use implementation name in S3 intermediates --- batch/scripts/cogify.sh | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/batch/scripts/cogify.sh b/batch/scripts/cogify.sh index f923ae821..96893567d 100755 --- a/batch/scripts/cogify.sh +++ b/batch/scripts/cogify.sh @@ -16,32 +16,32 @@ ME=$(basename "$0") set -x # download all GeoTiff files -if [[ $(aws s3 ls "${PREFIX}/merged.tif") ]]; then - aws s3 cp "${PREFIX}/merged.tif" merged.tif +if [[ $(aws s3 ls "${PREFIX}/${IMPLEMENTATION}_merged.tif") ]]; then + aws s3 cp "${PREFIX}/${IMPLEMENTATION}_merged.tif" "${IMPLEMENTATION}_merged.tif" else aws s3 cp --recursive --exclude "*" --include "*.tif" "${SRC}" . # create VRT of input files so we can use gdal_translate - gdalbuildvrt merged.vrt *.tif + gdalbuildvrt "${IMPLEMENTATION}_merged.vrt" *.tif # merge all rasters into one huge raster using COG block size - gdal_translate -of GTiff -co TILED=YES -co BLOCKXSIZE="${BLOCK_SIZE}" -co BLOCKYSIZE="${BLOCK_SIZE}" -co COMPRESS=DEFLATE -co BIGTIFF=IF_SAFER -co NUM_THREADS=ALL_CPUS --config GDAL_CACHEMAX 70% --config GDAL_NUM_THREADS ALL_CPUS merged.vrt merged.tif - aws s3 cp merged.tif "${PREFIX}/merged.tif" + gdal_translate -of GTiff -co TILED=YES -co BLOCKXSIZE="${BLOCK_SIZE}" -co BLOCKYSIZE="${BLOCK_SIZE}" -co COMPRESS=DEFLATE -co BIGTIFF=IF_SAFER -co NUM_THREADS=ALL_CPUS --config GDAL_CACHEMAX 70% --config GDAL_NUM_THREADS ALL_CPUS "${IMPLEMENTATION}_merged.vrt" "${IMPLEMENTATION}_merged.tif" + aws s3 cp merged.tif "${PREFIX}/${IMPLEMENTATION}_merged.tif" fi -if [[ $(aws s3 ls "${PREFIX}/merged.tif.ovr") ]]; then - aws s3 cp "${PREFIX}/merged.tif.ovr" merged.tif.ovr +if [[ $(aws s3 ls "${PREFIX}/${IMPLEMENTATION}_merged.tif.ovr") ]]; then + aws s3 cp "${PREFIX}/${IMPLEMENTATION}_merged.tif.ovr" "${IMPLEMENTATION}_merged.tif.ovr" else # generate overviews externally - gdaladdo merged.tif -r "${RESAMPLE}" -ro --config GDAL_NUM_THREADS ALL_CPUS --config GDAL_CACHEMAX 70% --config COMPRESS_OVERVIEW DEFLATE - aws s3 cp merged.tif.ovr "${PREFIX}/merged.tif.ovr" + gdaladdo "${IMPLEMENTATION}_merged.tif" -r "${RESAMPLE}" -ro --config GDAL_NUM_THREADS ALL_CPUS --config GDAL_CACHEMAX 70% --config COMPRESS_OVERVIEW DEFLATE + aws s3 cp merged.tif.ovr "${PREFIX}/${IMPLEMENTATION}_merged.tif.ovr" fi # convert to COG using existing overviews, this adds some additional layout optimizations -gdal_translate merged.tif cog.tif -of COG -co COMPRESS=DEFLATE -co BLOCKSIZE="${BLOCK_SIZE}" -co BIGTIFF=IF_SAFER -co NUM_THREADS=ALL_CPUS -co OVERVIEWS=FORCE_USE_EXISTING --config GDAL_CACHEMAX 70% --config GDAL_NUM_THREADS ALL_CPUS +gdal_translate "${IMPLEMENTATION}_merged.tif" "${IMPLEMENTATION}.tif" -of COG -co COMPRESS=DEFLATE -co BLOCKSIZE="${BLOCK_SIZE}" -co BIGTIFF=IF_SAFER -co NUM_THREADS=ALL_CPUS -co OVERVIEWS=FORCE_USE_EXISTING --config GDAL_CACHEMAX 70% --config GDAL_NUM_THREADS ALL_CPUS # upload to data lake -aws s3 cp cog.tif "${TARGET}" +aws s3 cp "${IMPLEMENTATION}.tif" "${TARGET}" # delete intermediate file aws s3 rm "${PREFIX}/merged.tif" From 4b83ab9b4f36fd56c5d45a9e679ed7be6247cc1f Mon Sep 17 00:00:00 2001 From: Justin Terry Date: Wed, 31 Jul 2024 11:33:45 -0700 Subject: [PATCH 4/4] Connect with raster analysis step function --- batch/scripts/cogify.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/batch/scripts/cogify.sh b/batch/scripts/cogify.sh index 96893567d..3fb2d40b1 100755 --- a/batch/scripts/cogify.sh +++ b/batch/scripts/cogify.sh @@ -26,7 +26,7 @@ else # merge all rasters into one huge raster using COG block size gdal_translate -of GTiff -co TILED=YES -co BLOCKXSIZE="${BLOCK_SIZE}" -co BLOCKYSIZE="${BLOCK_SIZE}" -co COMPRESS=DEFLATE -co BIGTIFF=IF_SAFER -co NUM_THREADS=ALL_CPUS --config GDAL_CACHEMAX 70% --config GDAL_NUM_THREADS ALL_CPUS "${IMPLEMENTATION}_merged.vrt" "${IMPLEMENTATION}_merged.tif" - aws s3 cp merged.tif "${PREFIX}/${IMPLEMENTATION}_merged.tif" + aws s3 cp "${IMPLEMENTATION}_merged.tif" "${PREFIX}/${IMPLEMENTATION}_merged.tif" fi if [[ $(aws s3 ls "${PREFIX}/${IMPLEMENTATION}_merged.tif.ovr") ]]; then @@ -34,7 +34,7 @@ if [[ $(aws s3 ls "${PREFIX}/${IMPLEMENTATION}_merged.tif.ovr") ]]; then else # generate overviews externally gdaladdo "${IMPLEMENTATION}_merged.tif" -r "${RESAMPLE}" -ro --config GDAL_NUM_THREADS ALL_CPUS --config GDAL_CACHEMAX 70% --config COMPRESS_OVERVIEW DEFLATE - aws s3 cp merged.tif.ovr "${PREFIX}/${IMPLEMENTATION}_merged.tif.ovr" + aws s3 cp "${IMPLEMENTATION}_merged.tif.ovr" "${PREFIX}/${IMPLEMENTATION}_merged.tif.ovr" fi # convert to COG using existing overviews, this adds some additional layout optimizations @@ -44,8 +44,8 @@ gdal_translate "${IMPLEMENTATION}_merged.tif" "${IMPLEMENTATION}.tif" -of COG -c aws s3 cp "${IMPLEMENTATION}.tif" "${TARGET}" # delete intermediate file -aws s3 rm "${PREFIX}/merged.tif" -aws s3 rm "${PREFIX}/merged.tif.ovr" +aws s3 rm "${PREFIX}/${IMPLEMENTATION}_merged.tif" +aws s3 rm "${PREFIX}/${IMPLEMENTATION}_merged.tif.ovr" if [ -n "$EXPORT_TO_GEE" ]; then export_to_gee.py --dataset "${DATASET}" --implementation "${IMPLEMENTATION}"