From c10843b21554a52b71c7d8997a902cf716e3c2a6 Mon Sep 17 00:00:00 2001 From: Justin Terry Date: Fri, 26 Jul 2024 14:46:26 -0700 Subject: [PATCH] Save intermediate files --- app/tasks/cog_assets.py | 3 +++ batch/scripts/cogify.sh | 38 ++++++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/app/tasks/cog_assets.py b/app/tasks/cog_assets.py index 61a2e742b..6b8e6f694 100644 --- a/app/tasks/cog_assets.py +++ b/app/tasks/cog_assets.py @@ -9,6 +9,7 @@ from app.models.pydantic.change_log import ChangeLog from app.models.pydantic.creation_options import COGCreationOptions from app.models.pydantic.jobs import GDALCOGJob, Job +from app.settings.globals import DATA_LAKE_BUCKET from app.tasks import callback_constructor from app.tasks.batch import execute from app.tasks.raster_tile_set_assets.utils import JOB_ENV @@ -90,6 +91,8 @@ async def create_cogify_job( dataset, "-I", creation_options.implementation, + "--prefix", + f"s3://{DATA_LAKE_BUCKET}/{dataset}/{version}/raster/{srid}/cog", ] if creation_options.export_to_gee: diff --git a/batch/scripts/cogify.sh b/batch/scripts/cogify.sh index d85344e6c..000a730eb 100755 --- a/batch/scripts/cogify.sh +++ b/batch/scripts/cogify.sh @@ -3,44 +3,50 @@ set -e # requires arguments -# -s | --source -# -T | --target # --block_size # -r | --resample # -G | --export_to_gee -# -d | --dataset # -I | --implementation +# -t | --target +# --prefix ME=$(basename "$0") -. get_arguments.sh "$@" +#. get_arguments.sh "$@" set -x # download all GeoTiff files -aws s3 cp --recursive --exclude "*" --include "*.tif" "${SRC}" . -# create VRT of input files so we can use gdal_translate -if [ ! -f "merged.vrt" ]; then +if [[ $(aws s3 ls "${PREFIX}/merged.tif") ]]; then + aws s3 cp "${PREFIX}/merged.tif" merged.tif +else + aws s3 cp --recursive --exclude "*" --include "*.tif" "${SRC}" . + + # create VRT of input files so we can use gdal_translate gdalbuildvrt merged.vrt *.tif -fi -# merge all rasters into one huge raster using COG block size -if [ ! -f "merged.tif" ]; then + # merge all rasters into one huge raster using COG block size gdal_translate -of GTiff -co TILED=YES -co BLOCKXSIZE="${BLOCK_SIZE}" -co BLOCKYSIZE="${BLOCK_SIZE}" -co COMPRESS=DEFLATE -co BIGTIFF=IF_SAFER -co NUM_THREADS=ALL_CPUS --config GDAL_CACHEMAX 70% --config GDAL_NUM_THREADS ALL_CPUS merged.vrt merged.tif + aws s3 cp merged.tif "${PREFIX}/merged.tif" fi -# create overviews in raster -if ! gdalinfo "merged.tif" | grep -q "Overviews"; then - gdaladdo merged.tif -r "${RESAMPLE}" --config GDAL_NUM_THREADS ALL_CPUS --config GDAL_CACHEMAX 70% +if [[ $(aws s3 ls "${PREFIX}/merged.tif.ovr") ]]; then + aws s3 cp "${PREFIX}/merged.tif.ovr" merged.tif.ovr +else + # generate overviews externally + gdaladdo merged.tif -r "${RESAMPLE}" -ro --config GDAL_NUM_THREADS ALL_CPUS --config GDAL_CACHEMAX 70% --config COMPRESS_OVERVIEW DEFLATE + aws s3 cp merged.tif.ovr "${PREFIX}/merged.tif.ovr" fi # convert to COG using existing overviews, this adds some additional layout optimizations -if [ ! -f "cog.tif" ]; then - gdal_translate merged.tif cog.tif -of COG -co COMPRESS=DEFLATE -co BLOCKSIZE="${BLOCK_SIZE}" -co BIGTIFF=IF_SAFER -co NUM_THREADS=ALL_CPUS --config GDAL_CACHEMAX 70% --config GDAL_NUM_THREADS ALL_CPUS -fi +gdal_translate merged.tif cog.tif -of COG -co COMPRESS=DEFLATE -co BLOCKSIZE="${BLOCK_SIZE}" -co BIGTIFF=IF_SAFER -co NUM_THREADS=ALL_CPUS -co OVERVIEWS=FORCE_USE_EXISTING --config GDAL_CACHEMAX 70% --config GDAL_NUM_THREADS ALL_CPUS # upload to data lake aws s3 cp cog.tif "${TARGET}" +# delete intermediate file +aws s3 rm "${PREFIX}/merged.tif" +aws s3 rm "${PREFIX}/merged.tif.ovr" + if [ -n "$EXPORT_TO_GEE" ]; then export_to_gee.py --dataset "${DATASET}" --implementation "${IMPLEMENTATION}" fi