From e47df79a61813abdd3c68bc4244bec2144ecf997 Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Tue, 13 Feb 2024 12:03:34 -0800 Subject: [PATCH 01/44] GTC-2706 Give error code and message when asset UID is wrong If the asset uid when creating a raster tile cache is wrong (bad format or nonexistent), return a better status code and error message, rather than just a 500 (internal system error) Added new test in test_assets.py for error cases when the source asset uid does not exist. I factored out a new function for creating the default asset to reduce a bunch of duplicated code. Also, the test test_raster_tile_cache_asset had been completely disabled for quite a while. I re-enabled the first 4 sub-tests, which work fine. The fifth sub-test does not work, so I left it disabled and filed issue GTC-2735. --- .../raster_tile_cache_assets.py | 14 +- tests/routes/datasets/test_assets.py | 287 +++++++----------- 2 files changed, 121 insertions(+), 180 deletions(-) diff --git a/app/tasks/raster_tile_cache_assets/raster_tile_cache_assets.py b/app/tasks/raster_tile_cache_assets/raster_tile_cache_assets.py index eb9759721..93bf63e0e 100644 --- a/app/tasks/raster_tile_cache_assets/raster_tile_cache_assets.py +++ b/app/tasks/raster_tile_cache_assets/raster_tile_cache_assets.py @@ -4,6 +4,7 @@ import numpy as np from fastapi import HTTPException from fastapi.logger import logger +from asyncpg import DataError from app.crud.assets import get_asset from app.models.enum.assets import AssetType @@ -29,6 +30,7 @@ ) from app.utils.path import get_asset_uri, tile_uri_to_tiles_geojson +from ...errors import RecordNotFoundError async def raster_tile_cache_asset( dataset: str, @@ -204,9 +206,15 @@ async def raster_tile_cache_validator( Used in asset route. If validation fails, it will raise an HTTPException visible to user. """ - source_asset: ORMAsset = await get_asset( - input_data["creation_options"]["source_asset_id"] - ) + try: + source_asset: ORMAsset = await get_asset( + input_data["creation_options"]["source_asset_id"] + ) + except RecordNotFoundError as e: + raise HTTPException(status_code=404, detail=str(e)) + except DataError as e: + raise HTTPException(status_code=400, detail=str(e)) + if (source_asset.dataset != dataset) or (source_asset.version != version): message: str = ( "Dataset and version of source asset must match dataset and " diff --git a/tests/routes/datasets/test_assets.py b/tests/routes/datasets/test_assets.py index eb45f43c8..2a407c1b5 100644 --- a/tests/routes/datasets/test_assets.py +++ b/tests/routes/datasets/test_assets.py @@ -33,6 +33,49 @@ s3_client = get_s3_client() +# Create a first asset of a new version with a raster tile set and specified grid and +# pixel meaning. extra_opts specifies any extra options for the creation_options. 
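+# A rough usage sketch (the dataset/version values here are illustrative only,
+# not taken from a real test):
+#   default_asset_id = await create_test_default_asset(
+#       "my_dataset", "v1.0.0", "90/27008", "percent",
+#       {"compute_stats": True}, async_client, logs, httpd)
+# The returned value is the asset_id of the default asset once it and its
+# version have reached the "saved" state.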
+async def create_test_default_asset(dataset, version, primary_grid, pixel_meaning, extra_opts, async_client, logs, httpd): + raster_version_payload = { + "creation_options": { + "source_type": "raster", + "source_uri": [ + f"s3://{DATA_LAKE_BUCKET}/{FAKE_INT_DATA_PARAMS['prefix']}/tiles.geojson" + ], + "source_driver": "GeoTIFF", + "data_type": FAKE_INT_DATA_PARAMS["dtype_name"], + "no_data": FAKE_INT_DATA_PARAMS["no_data"], + "pixel_meaning": pixel_meaning, + "grid": primary_grid, + "resampling": "nearest", + "overwrite": True, + }, + } + if extra_opts != None: + raster_version_payload["creation_options"] |= extra_opts + + asset = await create_default_asset( + dataset, + version, + version_payload=raster_version_payload, + async_client=async_client, + execute_batch_jobs=True, + ) + default_asset_id = asset["asset_id"] + + await check_tasks_status(async_client, logs, [default_asset_id]) + + # Verify that the asset and version are in state "saved" + version_resp = await async_client.get(f"/dataset/{dataset}/{version}") + assert version_resp.json()["data"]["status"] == "saved" + + asset_resp = await async_client.get(f"/asset/{default_asset_id}") + assert asset_resp.json()["data"]["status"] == "saved" + + # Flush requests list so we're starting fresh + httpx.delete(f"http://localhost:{httpd.server_port}") + return default_asset_id + @pytest.mark.asyncio async def test_assets(async_client): @@ -181,40 +224,8 @@ async def test_auxiliary_raster_asset(async_client, httpd, logs): for key in pixetl_output_files: s3_client.delete_object(Bucket=DATA_LAKE_BUCKET, Key=key) - raster_version_payload = { - "creation_options": { - "source_type": "raster", - "source_uri": [ - f"s3://{DATA_LAKE_BUCKET}/{FAKE_INT_DATA_PARAMS['prefix']}/tiles.geojson" - ], - "source_driver": "GeoTIFF", - "data_type": FAKE_INT_DATA_PARAMS["dtype_name"], - "no_data": FAKE_INT_DATA_PARAMS["no_data"], - "pixel_meaning": "percent", - "grid": primary_grid, - "resampling": "nearest", - "overwrite": True, - "subset": "90N_000E", - } - } - asset = await create_default_asset( - dataset, - version, - version_payload=raster_version_payload, - async_client=async_client, - execute_batch_jobs=True, - ) - asset_id = asset["asset_id"] - - # Verify that the asset and version are in state "saved" - version_resp = await async_client.get(f"/dataset/{dataset}/{version}") - assert version_resp.json()["data"]["status"] == "saved" - - asset_resp = await async_client.get(f"/asset/{asset_id}") - assert asset_resp.json()["data"]["status"] == "saved" - - # Flush requests list so we're starting fresh - httpx.delete(f"http://localhost:{httpd.server_port}") + await create_test_default_asset(dataset, version, primary_grid, "percent", + { "subset": "90N_000E" }, async_client, logs, httpd) # Try adding a non-default raster tile asset based on the default asset_payload = { @@ -451,8 +462,8 @@ async def test_asset_bad_requests(async_client, batch_client, httpd): ] -@pytest.mark.skip("Disabling for a few days while replacements are made") -@pytest.mark.parametrize("checks", symbology_checks) +# The 5th case in symbology_checks[] is not currently working (see GTC-2735). 
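+# (symbology_checks[:4] below therefore parametrizes only the first four
+# cases; restore the full list once that issue is fixed.)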
+@pytest.mark.parametrize("checks", symbology_checks[:4]) @pytest.mark.asyncio async def test_raster_tile_cache_asset(checks, async_client, batch_client, httpd): """""" @@ -462,44 +473,9 @@ async def test_raster_tile_cache_asset(checks, async_client, batch_client, httpd dataset = "test_raster_tile_cache_asset" version = "v1.0.0" primary_grid = "90/27008" - pixel_meaning = "date_conf" - raster_version_payload = { - "creation_options": { - "source_type": "raster", - "source_uri": [ - f"s3://{DATA_LAKE_BUCKET}/{FAKE_INT_DATA_PARAMS['prefix']}/tiles.geojson" - ], - "source_driver": "GeoTIFF", - "data_type": FAKE_INT_DATA_PARAMS["dtype_name"], - "no_data": FAKE_INT_DATA_PARAMS["no_data"], - "pixel_meaning": pixel_meaning, - "grid": primary_grid, - "resampling": "nearest", - "overwrite": True, - }, - } - asset = await create_default_asset( - dataset, - version, - version_payload=raster_version_payload, - async_client=async_client, - execute_batch_jobs=True, - ) - default_asset_id = asset["asset_id"] - - await check_tasks_status(async_client, logs, [default_asset_id]) - - # Verify that the asset and version are in state "saved" - version_resp = await async_client.get(f"/dataset/{dataset}/{version}") - assert version_resp.json()["data"]["status"] == "saved" - - asset_resp = await async_client.get(f"/asset/{default_asset_id}") - assert asset_resp.json()["data"]["status"] == "saved" - - # Flush requests list so we're starting fresh - httpx.delete(f"http://localhost:{httpd.server_port}") + default_asset_id = await create_test_default_asset(dataset, version, primary_grid, pixel_meaning, None, async_client, logs, httpd) await _test_raster_tile_cache( dataset, @@ -632,7 +608,7 @@ async def _test_raster_tile_cache( @pytest.mark.hanging @pytest.mark.asyncio -async def test_asset_stats(async_client): +async def test_asset_stats(async_client, logs, httpd): dataset = "test_asset_stats" version = "v1.0.0" @@ -641,31 +617,9 @@ async def test_asset_stats(async_client): ) delete_s3_files(DATA_LAKE_BUCKET, pixetl_output_files_prefix) - raster_version_payload = { - "creation_options": { - "source_type": "raster", - "source_uri": [ - f"s3://{DATA_LAKE_BUCKET}/{FAKE_INT_DATA_PARAMS['prefix']}/tiles.geojson" - ], - "source_driver": "GeoTIFF", - "data_type": FAKE_INT_DATA_PARAMS["dtype_name"], - "no_data": FAKE_INT_DATA_PARAMS["no_data"], - "pixel_meaning": "percent", - "grid": "90/27008", - "resampling": "nearest", - "overwrite": True, - "compute_histogram": True, - "compute_stats": True, - }, - } - - await create_default_asset( - dataset, - version, - version_payload=raster_version_payload, - async_client=async_client, - execute_batch_jobs=True, - ) + await create_test_default_asset(dataset, version, "90/27008", "percent", + { "compute_histogram": True, "compute_stats": True }, + async_client, logs, httpd) resp = await async_client.get(f"/dataset/{dataset}/{version}/assets") asset_id = resp.json()["data"][0]["asset_id"] @@ -683,7 +637,7 @@ async def test_asset_stats(async_client): @pytest.mark.hanging @pytest.mark.asyncio -async def test_asset_stats_no_histo(async_client): +async def test_asset_stats_no_histo(async_client, logs, httpd): dataset = "test_asset_stats_no_histo" version = "v1.0.0" @@ -692,31 +646,9 @@ async def test_asset_stats_no_histo(async_client): ) delete_s3_files(DATA_LAKE_BUCKET, pixetl_output_files_prefix) - raster_version_payload = { - "creation_options": { - "source_type": "raster", - "source_uri": [ - f"s3://{DATA_LAKE_BUCKET}/{FAKE_INT_DATA_PARAMS['prefix']}/tiles.geojson" - ], - 
"source_driver": "GeoTIFF", - "data_type": FAKE_INT_DATA_PARAMS["dtype_name"], - "no_data": FAKE_INT_DATA_PARAMS["no_data"], - "pixel_meaning": "percent", - "grid": "90/27008", - "resampling": "nearest", - "overwrite": True, - "compute_histogram": False, - "compute_stats": True, - }, - } - - await create_default_asset( - dataset, - version, - version_payload=raster_version_payload, - async_client=async_client, - execute_batch_jobs=True, - ) + await create_test_default_asset(dataset, version, "90/27008", "percent", + { "compute_histogram": False, "compute_stats": True }, + async_client, logs, httpd) resp = await async_client.get(f"/dataset/{dataset}/{version}/assets") asset_id = resp.json()["data"][0]["asset_id"] @@ -730,7 +662,7 @@ async def test_asset_stats_no_histo(async_client): @pytest.mark.hanging @pytest.mark.asyncio -async def test_asset_extent(async_client): +async def test_asset_extent(async_client, logs, httpd): dataset = "test_asset_extent" version = "v1.0.0" @@ -739,32 +671,9 @@ async def test_asset_extent(async_client): ) delete_s3_files(DATA_LAKE_BUCKET, pixetl_output_files_prefix) - raster_version_payload = { - "creation_options": { - "source_type": "raster", - "source_uri": [ - f"s3://{DATA_LAKE_BUCKET}/{FAKE_INT_DATA_PARAMS['prefix']}/tiles.geojson" - ], - "source_driver": "GeoTIFF", - "data_type": FAKE_INT_DATA_PARAMS["dtype_name"], - "no_data": FAKE_INT_DATA_PARAMS["no_data"], - "pixel_meaning": "percent", - "grid": "90/27008", - "resampling": "nearest", - "overwrite": True, - "compute_histogram": False, - "compute_stats": False, - }, - } - - await create_default_asset( - dataset, - version, - version_payload=raster_version_payload, - async_client=async_client, - execute_batch_jobs=True, - ) - + await create_test_default_asset(dataset, version, "90/27008", "percent", + { "compute_histogram": False, "compute_stats": False }, + async_client, logs, httpd) expected_coords = [ [[0.0, 90.0], [90.0, 90.0], [90.0, 0.0], [0.0, 0.0], [0.0, 90.0]] ] @@ -785,7 +694,7 @@ async def test_asset_extent(async_client): @pytest.mark.hanging @pytest.mark.asyncio -async def test_asset_extent_stats_empty(async_client): +async def test_asset_extent_stats_empty(async_client, logs, httpd): dataset = "test_asset_extent_stats_empty" version = "v1.0.0" @@ -794,31 +703,9 @@ async def test_asset_extent_stats_empty(async_client): ) delete_s3_files(DATA_LAKE_BUCKET, pixetl_output_files_prefix) - raster_version_payload = { - "creation_options": { - "source_type": "raster", - "source_uri": [ - f"s3://{DATA_LAKE_BUCKET}/{FAKE_INT_DATA_PARAMS['prefix']}/tiles.geojson" - ], - "source_driver": "GeoTIFF", - "data_type": FAKE_INT_DATA_PARAMS["dtype_name"], - "no_data": FAKE_INT_DATA_PARAMS["no_data"], - "pixel_meaning": "percent", - "grid": "90/27008", - "resampling": "nearest", - "overwrite": True, - "compute_histogram": False, - "compute_stats": False, - }, - } - - await create_default_asset( - dataset, - version, - version_payload=raster_version_payload, - async_client=async_client, - execute_batch_jobs=True, - ) + await create_test_default_asset(dataset, version, "90/27008", "percent", + { "compute_histogram": False, "compute_stats": False }, + async_client, logs, httpd) resp = await async_client.get(f"/dataset/{dataset}/{version}/assets") asset_id = resp.json()["data"][0]["asset_id"] @@ -1089,3 +976,49 @@ async def test_raster_asset_payloads_vector_source(async_client): ) resp_json = create_asset_resp.json() assert resp_json["status"] == "success" + + +asset_errors = [ + ( "999", 400 ), + ( 
"12345678-1234-1234-1234-123456789abc", 404 ) +] + +@pytest.mark.parametrize("asset_error", asset_errors) +@pytest.mark.asyncio +async def test_raster_tile_cache_nonexistent_asset(asset_error, async_client, batch_client, httpd): + """Test error cases where the source_asset_id doesn't exist""" + _, logs = batch_client + + # Add a dataset, version, and default (raster tile set) asset + dataset = "test_raster_tile_cache_asset" + version = "v1.0.0" + primary_grid = "90/27008" + pixel_meaning = "date_conf" + + await create_test_default_asset(dataset, version, primary_grid, pixel_meaning, None, async_client, logs, httpd) + + default_asset_id = asset_error[0] + symbology = symbology_checks[0]["symbology"] + asset_payload = { + "asset_type": "Raster tile cache", + "is_managed": True, + "creation_options": { + "source_asset_id": default_asset_id, + "min_zoom": 0, + "max_zoom": 2, + "max_static_zoom": 1, + "symbology": symbology, + "implementation": symbology["type"], + }, + "metadata": asset_metadata, + } + + create_asset_resp = await async_client.post( + f"/dataset/{dataset}/{version}/assets", json=asset_payload + ) + resp_json = create_asset_resp.json() + # + print(f"CREATE TILE CACHE ASSET RESPONSE: {resp_json}") + assert resp_json["status"] == "failed" + assert create_asset_resp.status_code == asset_error[1] + From 445b5919235c93dc8af7dfc29aa7dc9605a4a05b Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Mon, 26 Feb 2024 09:58:24 +0300 Subject: [PATCH 02/44] have dev branches share load balancer and api gateway stage --- terraform/api_gateway.tf | 337 ---------------------------- terraform/cloudfront.tf | 97 ++++---- terraform/data.tf | 13 +- terraform/main.tf | 48 ++-- terraform/outputs.tf | 8 +- terraform/variables.tf | 96 +++++--- terraform/vars/terraform-dev.tfvars | 28 ++- 7 files changed, 184 insertions(+), 443 deletions(-) delete mode 100644 terraform/api_gateway.tf diff --git a/terraform/api_gateway.tf b/terraform/api_gateway.tf deleted file mode 100644 index ab2f67df4..000000000 --- a/terraform/api_gateway.tf +++ /dev/null @@ -1,337 +0,0 @@ -resource "aws_api_gateway_rest_api" "api_gw_api" { - name = substr("GFWDataAPIGateway${local.name_suffix}", 0, 64) - description = "GFW Data API Gateway" - api_key_source = "AUTHORIZER" # pragma: allowlist secret - - endpoint_configuration { - types = ["REGIONAL"] - } -} - -resource "aws_api_gateway_resource" "dataset_parent" { - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - parent_id = aws_api_gateway_rest_api.api_gw_api.root_resource_id - path_part = "dataset" -} - -resource "aws_api_gateway_resource" "dataset" { - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - parent_id = aws_api_gateway_resource.dataset_parent.id - path_part = "{dataset}" -} - -resource "aws_api_gateway_resource" "version" { - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - parent_id = aws_api_gateway_resource.dataset.id - path_part = "{version}" -} - -resource "aws_api_gateway_resource" "query_parent" { - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - parent_id = aws_api_gateway_resource.version.id - path_part = "query" -} - -module "query_resource" { - source = "./modules/api_gateway/resource" - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - parent_id = aws_api_gateway_resource.query_parent.id - path_part = "{proxy+}" -} - -module "query_get" { - source = "./modules/api_gateway/endpoint" - - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - authorizer_id = aws_api_gateway_authorizer.api_key.id - api_resource = 
module.query_resource.aws_api_gateway_resource - - require_api_key = false - http_method = "GET" - authorization = "NONE" - - integration_parameters = { - "integration.request.path.version" = "method.request.path.version" - "integration.request.path.dataset" = "method.request.path.dataset", - "integration.request.path.proxy" = "method.request.path.proxy" - } - - method_parameters = { - "method.request.path.dataset" = true, - "method.request.path.version" = true - "method.request.path.proxy" = true - - } - - integration_uri = "http://${module.fargate_autoscaling.lb_dns_name}/dataset/{dataset}/{version}/query/{proxy}" -} - -module "query_post" { - source = "./modules/api_gateway/endpoint" - - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - authorizer_id = aws_api_gateway_authorizer.api_key.id - api_resource = module.query_resource.aws_api_gateway_resource - - require_api_key = false - http_method = "POST" - authorization = "NONE" - - integration_parameters = { - "integration.request.path.version" = "method.request.path.version" - "integration.request.path.dataset" = "method.request.path.dataset", - "integration.request.path.proxy" = "method.request.path.proxy" - } - - method_parameters = { - "method.request.path.dataset" = true, - "method.request.path.version" = true - "method.request.path.proxy" = true - - } - - integration_uri = "http://${module.fargate_autoscaling.lb_dns_name}/dataset/{dataset}/{version}/query/{proxy}" -} - -resource "aws_api_gateway_resource" "download_parent" { - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - parent_id = aws_api_gateway_resource.version.id - path_part = "download" -} - -module "download_shapes_resources" { - source = "./modules/api_gateway/resource" - - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - parent_id = aws_api_gateway_resource.download_parent.id - - for_each = toset(var.download_endpoints) - path_part = each.key -} - -module "download_shapes_endpoint" { - source = "./modules/api_gateway/endpoint" - - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - authorizer_id = aws_api_gateway_authorizer.api_key.id - - for_each = module.download_shapes_resources - api_resource = each.value.aws_api_gateway_resource - - require_api_key = true - http_method = "GET" - authorization = "CUSTOM" - - integration_parameters = { - "integration.request.path.dataset" = "method.request.path.dataset", - "integration.request.path.version" = "method.request.path.version" - } - - method_parameters = { - "method.request.path.dataset" = true, - "method.request.path.version" = true - } - - integration_uri = "http://${module.fargate_autoscaling.lb_dns_name}/dataset/{dataset}/{version}/download/${each.key}" -} - -module unprotected_resource { - source = "./modules/api_gateway/resource" - - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - parent_id = aws_api_gateway_rest_api.api_gw_api.root_resource_id - path_part = "{proxy+}" - -} - -module "unprotected_endpoints" { - source = "./modules/api_gateway/endpoint" - - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - authorizer_id = aws_api_gateway_authorizer.api_key.id - api_resource = module.unprotected_resource.aws_api_gateway_resource - - - require_api_key = false - http_method = "ANY" - authorization = "NONE" - - method_parameters = { "method.request.path.proxy" = true } - integration_parameters = { "integration.request.path.proxy" = "method.request.path.proxy" } - - integration_uri = "http://${module.fargate_autoscaling.lb_dns_name}/{proxy}" -} - - -resource "aws_api_gateway_usage_plan" 
"internal" { - name = substr("internal_apps${local.name_suffix}", 0, 64) - - api_stages { - api_id = aws_api_gateway_rest_api.api_gw_api.id - stage = aws_api_gateway_stage.api_gw_stage.stage_name - } - - quota_settings { - limit = var.api_gateway_usage_plans.internal_apps.quota_limit - period = "DAY" - } - - throttle_settings { - burst_limit = var.api_gateway_usage_plans.internal_apps.burst_limit - rate_limit = var.api_gateway_usage_plans.internal_apps.rate_limit - } - - # terraform doesn't expose API Gateway's method level throttling so will do that - # manually and this will stop terraform from destroying the manual changes - # Open PR to add the feature to terraform: https://github.com/hashicorp/terraform-provider-aws/pull/20672 - lifecycle { - ignore_changes = all - } -} - -resource "aws_api_gateway_usage_plan" "external" { - name = substr("external_apps${local.name_suffix}", 0, 64) - - api_stages { - api_id = aws_api_gateway_rest_api.api_gw_api.id - stage = aws_api_gateway_stage.api_gw_stage.stage_name - } - - quota_settings { - limit = var.api_gateway_usage_plans.external_apps.quota_limit - period = "DAY" - } - - throttle_settings { - burst_limit = var.api_gateway_usage_plans.external_apps.burst_limit - rate_limit = var.api_gateway_usage_plans.external_apps.rate_limit - } - - # terraform doesn't expose API Gateway's method level throttling so will do that - # manually and this will stop terraform from destroying the manual changes - # Open PR to add the feature to terraform: https://github.com/hashicorp/terraform-provider-aws/pull/20672 - lifecycle { - ignore_changes = all - } - -} - -resource "aws_api_gateway_deployment" "api_gw_dep" { - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - - triggers = { - redeployment = "${md5(file("api_gateway.tf"))}-${md5(file("./modules/api_gateway/endpoint/main.tf"))}-${md5(file("./modules/api_gateway/resource/main.tf"))}" - } - - depends_on = [ - module.query_get.integration_point, - module.query_post.integration_point, - #FIXME don't hardcode the spatial integration points - module.download_shapes_endpoint["shp"].integration_point, - module.download_shapes_endpoint["gpkg"].integration_point, - module.download_shapes_endpoint["geotiff"].integration_point, - module.unprotected_endpoints.integration_point - ] - - lifecycle { - create_before_destroy = true - } -} - -resource "aws_api_gateway_stage" "api_gw_stage" { - deployment_id = aws_api_gateway_deployment.api_gw_dep.id - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - stage_name = local.api_gw_stage_name -} - -# Lambda Authorizer -resource "aws_api_gateway_authorizer" "api_key" { - name = "api_key" - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - type = "REQUEST" - authorizer_uri = aws_lambda_function.authorizer.invoke_arn - authorizer_credentials = aws_iam_role.invocation_role.arn - authorizer_result_ttl_in_seconds = 0 - - # making sure terraform doesn't require default authorization - # header (https://github.com/hashicorp/terraform-provider-aws/issues/5845) - identity_source = "," -} - - -resource "aws_iam_role" "invocation_role" { - name = substr("api_gateway_auth_invocation${local.name_suffix}", 0, 64) - path = "/" - - assume_role_policy = data.template_file.api_gateway_role_policy.rendered -} - -resource "aws_iam_role_policy" "invocation_policy" { - name = "default" - role = aws_iam_role.invocation_role.id - - policy = data.local_file.iam_lambda_invoke.content -} - -resource "aws_iam_role" "lambda" { - name = substr("api_gw_authorizer_lambda${local.name_suffix}", 0, 64) 
- - assume_role_policy = data.template_file.lambda_role_policy.rendered -} - -resource "aws_lambda_function" "authorizer" { - filename = "api_gateway/api_key_authorizer_lambda.zip" - function_name = substr("api_gateway_authorizer${local.name_suffix}", 0, 64) - runtime = "python3.8" - role = aws_iam_role.lambda.arn - handler = "lambda_function.handler" - - source_code_hash = filebase64sha256("api_gateway/api_key_authorizer_lambda.zip") - - depends_on = [ - aws_iam_role.cloudwatch - ] -} - - -# Cloudwatch Logging -resource "aws_api_gateway_account" "main" { - cloudwatch_role_arn = aws_iam_role.cloudwatch.arn -} - -resource "aws_iam_role" "cloudwatch" { - name = substr("api_gateway_cloudwatch_global${local.name_suffix}", 0, 64) - - assume_role_policy = data.template_file.api_gateway_role_policy.rendered -} - -resource "aws_iam_role_policy" "api_gw_cloudwatch" { - name = "default" - role = aws_iam_role.cloudwatch.id - - policy = data.local_file.cloudwatch_log_policy.content -} - -resource "aws_iam_role_policy" "lambda_cloudwatch" { - name = "default" - role = aws_iam_role.lambda.id - - policy = data.local_file.cloudwatch_log_policy.content -} - -resource "aws_api_gateway_method_settings" "general_settings" { - rest_api_id = aws_api_gateway_rest_api.api_gw_api.id - stage_name = aws_api_gateway_stage.api_gw_stage.stage_name - method_path = "*/*" - - settings { - # Enable CloudWatch logging and metrics - metrics_enabled = true - data_trace_enabled = true - logging_level = "INFO" - } - - depends_on = [ - aws_iam_role.cloudwatch - ] -} diff --git a/terraform/cloudfront.tf b/terraform/cloudfront.tf index b6f49c301..278054ba1 100644 --- a/terraform/cloudfront.tf +++ b/terraform/cloudfront.tf @@ -1,17 +1,18 @@ resource "aws_cloudfront_distribution" "data_api" { - enabled = true - is_ipv6_enabled = true - price_class = "PriceClass_All" - aliases = var.environment == "dev" ? null : [replace(var.service_url, "https://", "")] + count = var.create_cloudfront_distribution == true ? 1 : 0 + enabled = true + is_ipv6_enabled = true + price_class = "PriceClass_All" + aliases = var.environment == "dev" ? 
null : [replace(var.service_url, "https://", "")] - origin { - domain_name = module.fargate_autoscaling.lb_dns_name + origin { + domain_name = local.lb_dns_name custom_origin_config { - http_port = 80 - https_port = 443 + http_port = 80 + https_port = 443 origin_keepalive_timeout = 5 - origin_protocol_policy = "http-only" - origin_read_timeout = 30 + origin_protocol_policy = "http-only" + origin_read_timeout = 30 origin_ssl_protocols = [ "TLSv1", "TLSv1.1", @@ -22,83 +23,83 @@ resource "aws_cloudfront_distribution" "data_api" { } ordered_cache_behavior { - path_pattern = "/" - target_origin_id = "load_balancer" - default_ttl = 0 - min_ttl = 0 - max_ttl = 31536000 # 1y - - allowed_methods = ["GET", "HEAD", "OPTIONS", "PUT", "POST", "PATCH", "DELETE"] - cached_methods = ["GET", "HEAD"] + path_pattern = "/" + target_origin_id = "load_balancer" + default_ttl = 0 + min_ttl = 0 + max_ttl = 31536000 # 1y + + allowed_methods = ["GET", "HEAD", "OPTIONS", "PUT", "POST", "PATCH", "DELETE"] + cached_methods = ["GET", "HEAD"] viewer_protocol_policy = "redirect-to-https" forwarded_values { - headers = ["Origin", "Access-Control-Request-Headers", "Access-Control-Request-Method", "x-api-key", "Referer", "Authorization"] - query_string = true + headers = ["Origin", "Access-Control-Request-Headers", "Access-Control-Request-Method", "x-api-key", "Referer", "Authorization"] + query_string = true cookies { - forward = "none" + forward = "none" whitelisted_names = [] } } } ordered_cache_behavior { - path_pattern = "/openapi.json" - target_origin_id = "load_balancer" - default_ttl = 0 - min_ttl = 0 - max_ttl = 31536000 # 1y - - allowed_methods = ["GET", "HEAD", "OPTIONS"] - cached_methods = ["GET", "HEAD"] + path_pattern = "/openapi.json" + target_origin_id = "load_balancer" + default_ttl = 0 + min_ttl = 0 + max_ttl = 31536000 # 1y + + allowed_methods = ["GET", "HEAD", "OPTIONS"] + cached_methods = ["GET", "HEAD"] viewer_protocol_policy = "redirect-to-https" forwarded_values { - headers = ["Origin", "Access-Control-Request-Headers", "Access-Control-Request-Method", "x-api-key", "Referer", "Authorization"] - query_string = true + headers = ["Origin", "Access-Control-Request-Headers", "Access-Control-Request-Method", "x-api-key", "Referer", "Authorization"] + query_string = true cookies { - forward = "none" + forward = "none" whitelisted_names = [] } } } origin { - domain_name = trimsuffix(trimprefix(aws_api_gateway_stage.api_gw_stage.invoke_url, "https://"), "/${local.api_gw_stage_name}") + domain_name = trimsuffix(trimprefix(var.api_gateway_url == "" ? 
module.api_gateway[0].invoke_url : var.api_gateway_url, "https://"), "/${var.api_gateway_stage_name}") custom_origin_config { - http_port = 80 - https_port = 443 + http_port = 80 + https_port = 443 origin_keepalive_timeout = 5 - origin_protocol_policy = "https-only" - origin_read_timeout = 30 + origin_protocol_policy = "https-only" + origin_read_timeout = 30 origin_ssl_protocols = [ "TLSv1", "TLSv1.1", "TLSv1.2", ] } - origin_id = "api_gateway" - origin_path = "/${local.api_gw_stage_name}" + origin_id = "api_gateway" + origin_path = "/${var.api_gateway_stage_name}" } default_cache_behavior { - target_origin_id = "api_gateway" - default_ttl = 0 - min_ttl = 0 - max_ttl = 31536000 # 1y + target_origin_id = "api_gateway" + default_ttl = 0 + min_ttl = 0 + max_ttl = 31536000 # 1y - allowed_methods = ["GET", "HEAD", "OPTIONS", "PUT", "POST", "PATCH", "DELETE"] - cached_methods = ["GET", "HEAD"] + allowed_methods = ["GET", "HEAD", "OPTIONS", "PUT", "POST", "PATCH", "DELETE"] + cached_methods = ["GET", "HEAD"] viewer_protocol_policy = "redirect-to-https" forwarded_values { - headers = ["Origin", "Access-Control-Request-Headers", "Access-Control-Request-Method", "x-api-key", "Referer", "Authorization"] - query_string = true + headers = ["Origin", "Access-Control-Request-Headers", "Access-Control-Request-Method", "x-api-key", "Referer", "Authorization"] + query_string = true cookies { - forward = "none" + forward = "none" whitelisted_names = [] } } diff --git a/terraform/data.tf b/terraform/data.tf index 8bf2582af..6016f0118 100644 --- a/terraform/data.tf +++ b/terraform/data.tf @@ -74,10 +74,10 @@ data "template_file" "container_definition" { api_token_secret_arn = data.terraform_remote_state.core.outputs.secrets_read-gfw-api-token_arn aws_gcs_key_secret_arn = data.terraform_remote_state.core.outputs.secrets_read-gfw-gee-export_arn - api_gateway_id = aws_api_gateway_rest_api.api_gw_api.id - api_gateway_internal_usage_plan = aws_api_gateway_usage_plan.internal.id - api_gateway_external_usage_plan = aws_api_gateway_usage_plan.external.id - api_gateway_stage_name = aws_api_gateway_stage.api_gw_stage.stage_name + api_gateway_id = var.api_gateway_id == "" ? module.api_gateway[0].api_gateway_id : var.api_gateway_id + api_gateway_external_usage_plan = var.api_gw_external_up_id == "" ? module.api_gateway[0].external_usage_plan_id : var.api_gw_external_up_id + api_gateway_internal_usage_plan = var.api_gw_internal_up_id == "" ? module.api_gateway[0].internal_usage_plan_id : var.api_gw_internal_up_id + api_gateway_stage_name = var.api_gateway_stage_name internal_domains = var.internal_domains # TODO move to core-infrastructure when operational @@ -170,3 +170,8 @@ data "aws_iam_policy_document" "read_new_relic_lic" { effect = "Allow" } } + +data "external" "generate_port" { + count = var.environment == "dev" ? 1 : 0 + program = ["python3", "${path.module}/generate_port.py", local.name_suffix, "30000", "31000"] +} diff --git a/terraform/main.tf b/terraform/main.tf index d9c6ea145..d0849c32a 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -27,7 +27,7 @@ locals { aurora_max_vcpus = local.aurora_instance_class == "db.t3.medium" ? 2 : local.aurora_instance_class == "db.r6g.large" ? 2 : local.aurora_instance_class == "db.r6g.xlarge" ? 4 : local.aurora_instance_class == "db.r6g.2xlarge" ? 8 : local.aurora_instance_class == "db.r6g.4xlarge" ? 16 : local.aurora_instance_class == "db.r6g.8xlarge" ? 32 : local.aurora_instance_class == "db.r6g.16xlarge" ? 64 : local.aurora_instance_class == "db.r5.large" ? 
2 : local.aurora_instance_class == "db.r5.xlarge" ? 4 : local.aurora_instance_class == "db.r5.2xlarge" ? 8 : local.aurora_instance_class == "db.r5.4xlarge" ? 16 : local.aurora_instance_class == "db.r5.8xlarge" ? 32 : local.aurora_instance_class == "db.r5.12xlarge" ? 48 : local.aurora_instance_class == "db.r5.16xlarge" ? 64 : local.aurora_instance_class == "db.r5.24xlarge" ? 96 : "" service_url = var.environment == "dev" ? "http://${module.fargate_autoscaling.lb_dns_name}" : var.service_url container_tag = substr(var.git_sha, 0, 7) - api_gw_stage_name = substr("deploy${replace(local.name_suffix, "-", "_")}", 0, 64) + lb_dns_name = coalesce(module.fargate_autoscaling.lb_dns_name, var.lb_dns_name) } # Docker image for FastAPI app @@ -76,22 +76,26 @@ module "batch_tile_cache_image" { module "fargate_autoscaling" { - source = "git::https://github.com/wri/gfw-terraform-modules.git//terraform/modules/fargate_autoscaling?ref=v0.4.2.3" - project = local.project - name_suffix = local.name_suffix - tags = local.fargate_tags - vpc_id = data.terraform_remote_state.core.outputs.vpc_id - private_subnet_ids = data.terraform_remote_state.core.outputs.private_subnet_ids - public_subnet_ids = data.terraform_remote_state.core.outputs.public_subnet_ids - container_name = var.container_name - container_port = var.container_port - desired_count = var.desired_count - fargate_cpu = var.fargate_cpu - fargate_memory = var.fargate_memory - auto_scaling_cooldown = var.auto_scaling_cooldown - auto_scaling_max_capacity = var.auto_scaling_max_capacity - auto_scaling_max_cpu_util = var.auto_scaling_max_cpu_util - auto_scaling_min_capacity = var.auto_scaling_min_capacity + # source = "git::https://github.com/wri/gfw-terraform-modules.git//terraform/modules/fargate_autoscaling?ref=single-dev-lb" + source = "../gfw-terraform-modules/terraform/modules/fargate_autoscaling" + project = local.project + name_suffix = local.name_suffix + tags = local.fargate_tags + vpc_id = data.terraform_remote_state.core.outputs.vpc_id + private_subnet_ids = data.terraform_remote_state.core.outputs.private_subnet_ids + public_subnet_ids = data.terraform_remote_state.core.outputs.public_subnet_ids + container_name = var.container_name + container_port = var.container_port + desired_count = var.desired_count + fargate_cpu = var.fargate_cpu + fargate_memory = var.fargate_memory + load_balancer_arn = var.load_balancer_arn + load_balancer_security_group = var.load_balancer_security_group + listener_port = var.environment == "dev" ? data.external.generate_port[0].result.port : var.listener_port + auto_scaling_cooldown = var.auto_scaling_cooldown + auto_scaling_max_capacity = var.auto_scaling_max_capacity + auto_scaling_max_cpu_util = var.auto_scaling_max_cpu_util + auto_scaling_min_capacity = var.auto_scaling_min_capacity // acm_certificate_arn = var.environment == "dev" ? null : data.terraform_remote_state.core.outputs.acm_certificate security_group_ids = [data.terraform_remote_state.core.outputs.postgresql_security_group_id] task_role_policies = [ @@ -204,3 +208,13 @@ module "batch_job_queues" { aurora_max_vcpus = local.aurora_max_vcpus gcs_secret = data.terraform_remote_state.core.outputs.secrets_read-gfw-gee-export_arn } + +module "api_gateway" { + count = var.api_gateway_id == "" ? 
1 : 0 + source = "./modules/api_gateway/gateway" + lb_dns_name = local.lb_dns_name + api_gateway_role_policy = data.template_file.api_gateway_role_policy.rendered + lambda_role_policy = data.template_file.lambda_role_policy.rendered + cloudwatch_policy = data.local_file.cloudwatch_log_policy.content + lambda_invoke_policy = data.local_file.iam_lambda_invoke.content +} diff --git a/terraform/outputs.tf b/terraform/outputs.tf index af15b104a..0681cb88c 100644 --- a/terraform/outputs.tf +++ b/terraform/outputs.tf @@ -1,3 +1,7 @@ output "loadbalancer_dns" { - value = module.fargate_autoscaling.lb_dns_name -} \ No newline at end of file + value = coalesce(module.fargate_autoscaling.lb_dns_name, var.lb_dns_name) +} + +output "generated_port" { + value = length(data.external.generate_port) > 0 ? data.external.generate_port[0].result["port"] : var.listener_port +} diff --git a/terraform/variables.tf b/terraform/variables.tf index ecf08bdab..8b56728b9 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -73,40 +73,86 @@ variable "data_lake_max_vcpus" { default = 576 } -variable "api_gateway_usage_plans" { - type = map(any) - description = "Throttling limits for API Gateway" - default = { - internal_apps = { - quota_limit = 10000 # per day - burst_limit = 100 # per second - rate_limit = 200 - } - external_apps = { - quota_limit = 500 - burst_limit = 10 - rate_limit = 20 - } - } -} - variable "internal_domains" { type = string description = "Comma separated list of client domains for which we set first tier rate limiting." default = "*.globalforestwatch.org,globalforestwatch.org,api.resourcewatch.org,my.gfw-mapbuilder.org,resourcewatch.org" } -variable "download_endpoints" { - type = list(string) - description = "path parts to download endpoints" - - # listing spatial endpoints as gateway needs them explicitly created - # in order to apply endpoint-level throttling to them - default = ["geotiff", "gpkg", "shp"] -} #TODO import from core-infrastructure when operational variable "new_relic_license_key_arn" { type = string description = "New Relic license key ARN" } + +variable "load_balancer_arn" { + type = string + default = "" + description = "Optional Load Balancer to use for fargate cluster. When left blank, a new LB will be created" +} + +variable "load_balancer_security_group" { + type = string + default = "" + description = "Optional secuirty group of load balancer with which the task can communicate. Required if load_blancer_arn is not empty" +} + +variable "listener_port" { + type = number + description = "The default port the Load Balancer should listen to. Will be ignored when acm_certificate is set." + default = 80 +} + +variable "lb_dns_name" { + type = string + default = "" + description = "DNS name of load balancer for API Gateway to forward requests to. API Gateway will first look for one from fargate autoscaling module output before using this." 
+} + +variable "create_cloudfront_distribution" { + type = bool + default = true +} + +variable "api_gateway_id" { + type = string + description = "ID of API Gateway instance" + default = "" +} + +variable "api_gw_internal_up_id" { + type = string + description = "ID of API Gateway usage plan for internal domains" + default = "" +} + +variable "api_gw_external_up_id" { + type = string + description = "ID of API Gateway usage plan for external domains" + default = "" +} + +variable "api_gateway_name" { + type = string + description = "Name of API Gateway instance" + default = "GFWDataAPIGateway" +} + +variable "api_gateway_description" { + type = string + description = "Description of API Gateway Instance" + default = "GFW Data API Gateway" +} + +variable "api_gateway_stage_name" { + type = string + description = "Deployment stage name of API Gateway instance" + default = "deploy" +} + +variable "api_gateway_url" { + type = string + description = "The invoke url of the API Gateway stage" + default = "" +} diff --git a/terraform/vars/terraform-dev.tfvars b/terraform/vars/terraform-dev.tfvars index 7a7bebdcd..0996b0efe 100644 --- a/terraform/vars/terraform-dev.tfvars +++ b/terraform/vars/terraform-dev.tfvars @@ -1,10 +1,18 @@ -environment = "dev" -log_level = "debug" -service_url = "https://dev-data-api.globalforestwatch.org" # fake, needed for CloudFront -rw_api_url = "https://api.resourcewatch.org" -desired_count = 1 -auto_scaling_min_capacity = 1 -auto_scaling_max_capacity = 5 -lambda_analysis_workspace = "features-lat_lon" -key_pair = "dmannarino_gfw" -new_relic_license_key_arn = "arn:aws:secretsmanager:us-east-1:563860007740:secret:newrelic/license_key-lolw24" +environment = "dev" +log_level = "debug" +service_url = "https://dev-data-api.globalforestwatch.org" # fake, needed for CloudFront +rw_api_url = "https://api.resourcewatch.org" +desired_count = 1 +auto_scaling_min_capacity = 1 +auto_scaling_max_capacity = 5 +lambda_analysis_workspace = "features-lat_lon" +key_pair = "dmannarino_gfw" +create_cloudfront_distribution = false +new_relic_license_key_arn = "arn:aws:secretsmanager:us-east-1:563860007740:secret:newrelic/license_key-lolw24" +load_balancer_security_group = "sg-07c9331c01f8da1c8" +load_balancer_arn = "arn:aws:elasticloadbalancing:us-east-1:563860007740:loadbalancer/app/gfw-data-api-elb-shared-dev-lb/77f2fc64dcdf9ebc" +lb_dns_name = "gfw-data-api-elb-shared-dev-lb-1032652585.us-east-1.elb.amazonaws.com" +api_gateway_id = "ett3a69n8h" +api_gw_external_up_id = "08ipvv" +api_gw_internal_up_id = "ahj8m0" +api_gateway_url = "https://ett3a69n8h.execute-api.us-east-1.amazonaws.com/deploy" From dd84b04ad08441dea17e84acf3347e7374e28ff4 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Mon, 26 Feb 2024 18:35:09 +0300 Subject: [PATCH 03/44] point to the remote module --- terraform/main.tf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/terraform/main.tf b/terraform/main.tf index d0849c32a..2ceb7950b 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -76,8 +76,7 @@ module "batch_tile_cache_image" { module "fargate_autoscaling" { - # source = "git::https://github.com/wri/gfw-terraform-modules.git//terraform/modules/fargate_autoscaling?ref=single-dev-lb" - source = "../gfw-terraform-modules/terraform/modules/fargate_autoscaling" + source = "git::https://github.com/wri/gfw-terraform-modules.git//terraform/modules/fargate_autoscaling?ref=single-dev-lb" project = local.project name_suffix = local.name_suffix tags = local.fargate_tags From 
ece8168e15e58baab198251e89d8ceda4a645aa9 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Mon, 26 Feb 2024 23:10:14 +0300 Subject: [PATCH 04/44] enable distributed tracing and query parameter tracking for better debugging --- newrelic.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/newrelic.ini b/newrelic.ini index e3f769ced..8585f1207 100644 --- a/newrelic.ini +++ b/newrelic.ini @@ -37,7 +37,7 @@ app_name = GFW Data API # New Relic offers distributed tracing for monitoring and analyzing modern # distributed systems.Enable distributed tracing. -distributed_tracing.enabled = +distributed_tracing.enabled = true # When "true", the agent collects performance data about your # application and reports this data to the New Relic UI at @@ -115,7 +115,7 @@ high_security = false # then add "request.parameters.*" to the "attributes.include" # setting. For details about attributes configuration, please # consult the documentation. -# attributes.include = request.parameters.* +attributes.include = request.parameters.* # The transaction tracer captures deep information about slow # transactions and sends this to the UI on a periodic basis. The From 662276bb07183eb721ed8d1326f1c0c42ec46db8 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Tue, 27 Feb 2024 20:41:42 +0300 Subject: [PATCH 05/44] add api gateway module and port generating script --- .gitignore | 2 +- terraform/generate_port.py | 18 + terraform/modules/api_gateway/gateway/main.tf | 337 ++++++++++++++++++ .../modules/api_gateway/gateway/outputs.tf | 15 + .../modules/api_gateway/gateway/variables.tf | 64 ++++ 5 files changed, 435 insertions(+), 1 deletion(-) create mode 100644 terraform/generate_port.py create mode 100644 terraform/modules/api_gateway/gateway/main.tf create mode 100644 terraform/modules/api_gateway/gateway/outputs.tf create mode 100644 terraform/modules/api_gateway/gateway/variables.tf diff --git a/.gitignore b/.gitignore index c65308245..9656ed575 100644 --- a/.gitignore +++ b/.gitignore @@ -21,7 +21,7 @@ tests/cobertura.xml tests_v2/cobertura.xml # Terraform stuff -terraform/* +**/.terraform/* # Virtual Environments .venv* \ No newline at end of file diff --git a/terraform/generate_port.py b/terraform/generate_port.py new file mode 100644 index 000000000..adfd3f284 --- /dev/null +++ b/terraform/generate_port.py @@ -0,0 +1,18 @@ +import sys +import random +import json + + +try: + input_string = sys.argv[1] + min_port = int(sys.argv[2]) + max_port = int(sys.argv[3]) + + random.seed(input_string) + port = random.randint(min_port, max_port) + + output = {"port": str(port)} + print(json.dumps(output)) +except Exception as e: + print(f"Error: {str(e)}", file=sys.stderr) + sys.exit(1) diff --git a/terraform/modules/api_gateway/gateway/main.tf b/terraform/modules/api_gateway/gateway/main.tf new file mode 100644 index 000000000..2dcb9d63e --- /dev/null +++ b/terraform/modules/api_gateway/gateway/main.tf @@ -0,0 +1,337 @@ +resource "aws_api_gateway_rest_api" "api_gw_api" { + name = var.name + description = var.description + api_key_source = "AUTHORIZER" # pragma: allowlist secret + + endpoint_configuration { + types = ["REGIONAL"] + } +} + +resource "aws_api_gateway_resource" "dataset_parent" { + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + parent_id = aws_api_gateway_rest_api.api_gw_api.root_resource_id + path_part = "dataset" +} + +resource "aws_api_gateway_resource" "dataset" { + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + parent_id = aws_api_gateway_resource.dataset_parent.id + 
path_part = "{dataset}" +} + +resource "aws_api_gateway_resource" "version" { + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + parent_id = aws_api_gateway_resource.dataset.id + path_part = "{version}" +} + +resource "aws_api_gateway_resource" "query_parent" { + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + parent_id = aws_api_gateway_resource.version.id + path_part = "query" +} + +module "query_resource" { + source = "../resource" + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + parent_id = aws_api_gateway_resource.query_parent.id + path_part = "{proxy+}" +} + +module "query_get" { + source = "../endpoint" + + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + authorizer_id = aws_api_gateway_authorizer.api_key.id + api_resource = module.query_resource.aws_api_gateway_resource + + require_api_key = false + http_method = "GET" + authorization = "NONE" + + integration_parameters = { + "integration.request.path.version" = "method.request.path.version" + "integration.request.path.dataset" = "method.request.path.dataset", + "integration.request.path.proxy" = "method.request.path.proxy" + } + + method_parameters = { + "method.request.path.dataset" = true, + "method.request.path.version" = true + "method.request.path.proxy" = true + + } + + integration_uri = "http://${var.lb_dns_name}/dataset/{dataset}/{version}/query/{proxy}" +} + +module "query_post" { + source = "../endpoint" + + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + authorizer_id = aws_api_gateway_authorizer.api_key.id + api_resource = module.query_resource.aws_api_gateway_resource + + require_api_key = false + http_method = "POST" + authorization = "NONE" + + integration_parameters = { + "integration.request.path.version" = "method.request.path.version" + "integration.request.path.dataset" = "method.request.path.dataset", + "integration.request.path.proxy" = "method.request.path.proxy" + } + + method_parameters = { + "method.request.path.dataset" = true, + "method.request.path.version" = true + "method.request.path.proxy" = true + + } + + integration_uri = "http://${var.lb_dns_name}/dataset/{dataset}/{version}/query/{proxy}" +} + +resource "aws_api_gateway_resource" "download_parent" { + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + parent_id = aws_api_gateway_resource.version.id + path_part = "download" +} + +module "download_shapes_resources" { + source = "../resource" + + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + parent_id = aws_api_gateway_resource.download_parent.id + + for_each = toset(var.download_endpoints) + path_part = each.key +} + +module "download_shapes_endpoint" { + source = "../endpoint" + + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + authorizer_id = aws_api_gateway_authorizer.api_key.id + + for_each = module.download_shapes_resources + api_resource = each.value.aws_api_gateway_resource + + require_api_key = true + http_method = "GET" + authorization = "CUSTOM" + + integration_parameters = { + "integration.request.path.dataset" = "method.request.path.dataset", + "integration.request.path.version" = "method.request.path.version" + } + + method_parameters = { + "method.request.path.dataset" = true, + "method.request.path.version" = true + } + + integration_uri = "http://${var.lb_dns_name}/dataset/{dataset}/{version}/download/${each.key}" +} + +module "unprotected_resource" { + source = "../resource" + + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + parent_id = aws_api_gateway_rest_api.api_gw_api.root_resource_id + path_part = "{proxy+}" + +} 
+ +module "unprotected_endpoints" { + source = "../endpoint" + + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + authorizer_id = aws_api_gateway_authorizer.api_key.id + api_resource = module.unprotected_resource.aws_api_gateway_resource + + + require_api_key = false + http_method = "ANY" + authorization = "NONE" + + method_parameters = { "method.request.path.proxy" = true } + integration_parameters = { "integration.request.path.proxy" = "method.request.path.proxy" } + + integration_uri = "http://${var.lb_dns_name}/{proxy}" +} + + +resource "aws_api_gateway_usage_plan" "internal" { + name = substr("internal_apps", 0, 64) + + api_stages { + api_id = aws_api_gateway_rest_api.api_gw_api.id + stage = aws_api_gateway_stage.api_gw_stage.stage_name + } + + quota_settings { + limit = var.api_gateway_usage_plans.internal_apps.quota_limit + period = "DAY" + } + + throttle_settings { + burst_limit = var.api_gateway_usage_plans.internal_apps.burst_limit + rate_limit = var.api_gateway_usage_plans.internal_apps.rate_limit + } + + # terraform doesn't expose API Gateway's method level throttling so will do that + # manually and this will stop terraform from destroying the manual changes + # Open PR to add the feature to terraform: https://github.com/hashicorp/terraform-provider-aws/pull/20672 + lifecycle { + ignore_changes = all + } +} + +resource "aws_api_gateway_usage_plan" "external" { + name = substr("external_apps", 0, 64) + + api_stages { + api_id = aws_api_gateway_rest_api.api_gw_api.id + stage = aws_api_gateway_stage.api_gw_stage.stage_name + } + + quota_settings { + limit = var.api_gateway_usage_plans.external_apps.quota_limit + period = "DAY" + } + + throttle_settings { + burst_limit = var.api_gateway_usage_plans.external_apps.burst_limit + rate_limit = var.api_gateway_usage_plans.external_apps.rate_limit + } + + # terraform doesn't expose API Gateway's method level throttling so will do that + # manually and this will stop terraform from destroying the manual changes + # Open PR to add the feature to terraform: https://github.com/hashicorp/terraform-provider-aws/pull/20672 + lifecycle { + ignore_changes = all + } + +} + +resource "aws_api_gateway_deployment" "api_gw_dep" { + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + + triggers = { + redeployment = "${md5(file("main.tf"))}-${md5(file("${path.module}/../endpoint/main.tf"))}-${md5(file("${path.module}/../resource/main.tf"))}" + } + + depends_on = [ + module.query_get.integration_point, + module.query_post.integration_point, + #FIXME don't hardcode the spatial integration points + module.download_shapes_endpoint["shp"].integration_point, + module.download_shapes_endpoint["gpkg"].integration_point, + module.download_shapes_endpoint["geotiff"].integration_point, + module.unprotected_endpoints.integration_point + ] + + lifecycle { + create_before_destroy = true + } +} + +resource "aws_api_gateway_stage" "api_gw_stage" { + deployment_id = aws_api_gateway_deployment.api_gw_dep.id + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + stage_name = var.stage_name +} + +# Lambda Authorizer +resource "aws_api_gateway_authorizer" "api_key" { + name = "api_key" + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + type = "REQUEST" + authorizer_uri = aws_lambda_function.authorizer.invoke_arn + authorizer_credentials = aws_iam_role.invocation_role.arn + authorizer_result_ttl_in_seconds = 0 + + # making sure terraform doesn't require default authorization + # header (https://github.com/hashicorp/terraform-provider-aws/issues/5845) + 
identity_source = "," +} + + +resource "aws_iam_role" "invocation_role" { + name = substr("api_gateway_auth_invocation", 0, 64) + path = "/" + + assume_role_policy = var.api_gateway_role_policy +} + +resource "aws_iam_role_policy" "invocation_policy" { + name = "default" + role = aws_iam_role.invocation_role.id + + policy = var.lambda_invoke_policy +} + +resource "aws_iam_role" "lambda" { + name = substr("api_gw_authorizer_lambda", 0, 64) + + assume_role_policy = var.lambda_role_policy +} + +resource "aws_lambda_function" "authorizer" { + filename = "api_gateway/api_key_authorizer_lambda.zip" + function_name = substr("api_gateway_authorizer", 0, 64) + runtime = "python3.8" + role = aws_iam_role.lambda.arn + handler = "lambda_function.handler" + + source_code_hash = filebase64sha256("api_gateway/api_key_authorizer_lambda.zip") + + depends_on = [ + aws_iam_role.cloudwatch + ] +} + + +# Cloudwatch Logging +resource "aws_api_gateway_account" "main" { + cloudwatch_role_arn = aws_iam_role.cloudwatch.arn +} + +resource "aws_iam_role" "cloudwatch" { + name = substr("api_gateway_cloudwatch_global", 0, 64) + + assume_role_policy = var.api_gateway_role_policy +} + +resource "aws_iam_role_policy" "api_gw_cloudwatch" { + name = "default" + role = aws_iam_role.cloudwatch.id + + policy = var.cloudwatch_policy +} + +resource "aws_iam_role_policy" "lambda_cloudwatch" { + name = "default" + role = aws_iam_role.lambda.id + + policy = var.cloudwatch_policy +} + +resource "aws_api_gateway_method_settings" "general_settings" { + rest_api_id = aws_api_gateway_rest_api.api_gw_api.id + stage_name = aws_api_gateway_stage.api_gw_stage.stage_name + method_path = "*/*" + + settings { + # Enable CloudWatch logging and metrics + metrics_enabled = true + data_trace_enabled = true + logging_level = "INFO" + } + + depends_on = [ + aws_iam_role.cloudwatch + ] +} diff --git a/terraform/modules/api_gateway/gateway/outputs.tf b/terraform/modules/api_gateway/gateway/outputs.tf new file mode 100644 index 000000000..d74ec3c60 --- /dev/null +++ b/terraform/modules/api_gateway/gateway/outputs.tf @@ -0,0 +1,15 @@ +output "internal_usage_plan_id" { + value = aws_api_gateway_usage_plan.internal.id +} + +output "external_usage_plan_id" { + value = aws_api_gateway_usage_plan.external.id +} + +output "api_gateway_id" { + value = aws_api_gateway_rest_api.api_gw_api.id +} + +output "invoke_url" { + value = aws_api_gateway_stage.api_gw_stage.invoke_url +} diff --git a/terraform/modules/api_gateway/gateway/variables.tf b/terraform/modules/api_gateway/gateway/variables.tf new file mode 100644 index 000000000..3f820a5cf --- /dev/null +++ b/terraform/modules/api_gateway/gateway/variables.tf @@ -0,0 +1,64 @@ +variable "name" { + type = string + description = "Name of API Gateway instance" + default = "GFWDataAPIGateway" +} + +variable "description" { + type = string + description = "Description of API Gateway Instance" + default = "GFW Data API Gateway" +} + +variable "stage_name" { + type = string + description = "The stage under which the instance will be deployed" + default = "deploy" +} + +variable "download_endpoints" { + type = list(string) + description = "path parts to download endpoints" + + # listing spatial endpoints as gateway needs them explicitly created + # in order to apply endpoint-level throttling to them + default = ["geotiff", "gpkg", "shp"] +} + +variable "lb_dns_name" { + type = string + description = "Application load balancer to forward requests to" +} + +variable "api_gateway_role_policy" { + type = string +} + +variable 
"lambda_role_policy" { + type = string +} + +variable "cloudwatch_policy" { + type = string +} + +variable "lambda_invoke_policy" { + type = string +} + +variable "api_gateway_usage_plans" { + type = map(any) + description = "Throttling limits for API Gateway" + default = { + internal_apps = { + quota_limit = 10000 # per day + burst_limit = 100 # per second + rate_limit = 200 + } + external_apps = { + quota_limit = 500 + burst_limit = 10 + rate_limit = 20 + } + } +} From c9cdca70c08c7d7d3bc1166c7adcb808d4351ad8 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Tue, 27 Feb 2024 12:42:40 -0500 Subject: [PATCH 06/44] Preserve vector input order to allow manual geom layering --- batch/scripts/create_vector_tile_cache.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batch/scripts/create_vector_tile_cache.sh b/batch/scripts/create_vector_tile_cache.sh index ee96564ca..4fe2547e7 100755 --- a/batch/scripts/create_vector_tile_cache.sh +++ b/batch/scripts/create_vector_tile_cache.sh @@ -34,7 +34,7 @@ echo "Fetch NDJSON data from Data Lake ${SRC} -> ${DATASET}" aws s3 cp "${SRC}" "${DATASET}" --no-progress echo "Build Tile Cache" -tippecanoe -Z"${MIN_ZOOM}" -z"${MAX_ZOOM}" -e tilecache "${STRATEGY[@]}" -P -n "${DATASET}" "${DATASET}" +tippecanoe -Z"${MIN_ZOOM}" -z"${MAX_ZOOM}" -e tilecache "${STRATEGY[@]}" -P -n "${DATASET}" "${DATASET}" --preserve-input-order echo "Upload tiles to S3" tileputty tilecache --bucket "${TILE_CACHE}" --dataset "${DATASET}" --version "${VERSION}" --implementation "${IMPLEMENTATION}" --cores "${NUM_PROCESSES}" \ No newline at end of file From b1d7f33c01d47a9e33b7ebb689e4def9deaba879 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Tue, 27 Feb 2024 20:42:55 +0300 Subject: [PATCH 07/44] more terraform gitignore rules --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index 9656ed575..3b9984c8c 100644 --- a/.gitignore +++ b/.gitignore @@ -23,5 +23,12 @@ tests_v2/cobertura.xml # Terraform stuff **/.terraform/* +# .tfstate files +*.tfstate +*.tfstate.* + +# .tfplan files +*.tfplan + # Virtual Environments .venv* \ No newline at end of file From 9e025bc89e12c6bc0af5dc28a0ae1bd91cd7dd27 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Wed, 28 Feb 2024 11:14:37 +0300 Subject: [PATCH 08/44] updated arns --- terraform/vars/terraform-dev.tfvars | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/terraform/vars/terraform-dev.tfvars b/terraform/vars/terraform-dev.tfvars index 0996b0efe..92024ec8d 100644 --- a/terraform/vars/terraform-dev.tfvars +++ b/terraform/vars/terraform-dev.tfvars @@ -10,9 +10,9 @@ key_pair = "dmannarino_gfw" create_cloudfront_distribution = false new_relic_license_key_arn = "arn:aws:secretsmanager:us-east-1:563860007740:secret:newrelic/license_key-lolw24" load_balancer_security_group = "sg-07c9331c01f8da1c8" -load_balancer_arn = "arn:aws:elasticloadbalancing:us-east-1:563860007740:loadbalancer/app/gfw-data-api-elb-shared-dev-lb/77f2fc64dcdf9ebc" -lb_dns_name = "gfw-data-api-elb-shared-dev-lb-1032652585.us-east-1.elb.amazonaws.com" -api_gateway_id = "ett3a69n8h" -api_gw_external_up_id = "08ipvv" -api_gw_internal_up_id = "ahj8m0" -api_gateway_url = "https://ett3a69n8h.execute-api.us-east-1.amazonaws.com/deploy" +load_balancer_arn = "arn:aws:elasticloadbalancing:us-east-1:563860007740:loadbalancer/app/gfw-data-api-elb-shared-dev-lb/60c3ad42ca6522e3" +lb_dns_name = "gfw-data-api-elb-shared-dev-lb-10091095.us-east-1.elb.amazonaws.com" +api_gateway_id = "ett3a69n8h" 
+api_gw_external_up_id = "08ipvv" +api_gw_internal_up_id = "ahj8m0" +api_gateway_url = "https://ett3a69n8h.execute-api.us-east-1.amazonaws.com/deploy" From bb1322ecd64c9f163cddd1bfbd31ef441cec19a1 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Wed, 28 Feb 2024 14:24:59 -0500 Subject: [PATCH 09/44] Restrict GHA build actions to exact match of branch names --- .github/workflows/terraform_build.yaml | 8 +++++--- .github/workflows/terraform_plan.yaml | 5 +++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/terraform_build.yaml b/.github/workflows/terraform_build.yaml index 17b1a394f..6a9cf67a2 100644 --- a/.github/workflows/terraform_build.yaml +++ b/.github/workflows/terraform_build.yaml @@ -20,6 +20,7 @@ jobs: with: project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} coverage-reports: tests/cobertura.xml, tests_v2/cobertura.xml + - name: Run CodeCOV action uses: codecov/codecov-action@v1 with: @@ -29,8 +30,9 @@ jobs: name: codecov-umbrella fail_ci_if_error: false verbose: false + - name: Deploy production - if: success() && contains(github.ref, 'master') + if: success() && github.ref == 'refs/heads/master' env: ENV: production AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_production }} @@ -41,7 +43,7 @@ jobs: ./scripts/infra apply - name: Deploy staging - if: success() && contains(github.ref, 'develop') + if: success() && github.ref == 'refs/heads/develop' env: ENV: staging AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_staging }} @@ -52,7 +54,7 @@ jobs: ./scripts/infra apply - name: Deploy dev - if: success() && (! contains(github.ref, 'develop')) && (! contains(github.ref, 'master')) + if: success() && (! github.ref == 'refs/heads/master' && github.ref == 'refs/heads/develop') env: ENV: dev AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_dev }} diff --git a/.github/workflows/terraform_plan.yaml b/.github/workflows/terraform_plan.yaml index abba75df6..b70445069 100644 --- a/.github/workflows/terraform_plan.yaml +++ b/.github/workflows/terraform_plan.yaml @@ -9,15 +9,16 @@ jobs: steps: - uses: actions/checkout@v1 - name: Plan production - if: success() && contains(github.base_ref, 'master') + if: success() && github.ref == 'refs/heads/master' env: ENV: production AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_production }} AWS_SECRET_ACCESS_KEY: ${{ secrets.aws_secret_production }} AWS_REGION: ${{ secrets.aws_region_production }} run: ./scripts/infra plan -w ${{ github.base_ref }} + - name: Plan staging - if: success() && contains(github.base_ref, 'develop') + if: success() && github.ref == 'refs/heads/develop' env: ENV: staging AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_staging }} From b33569e396af429bdacae25b08fbbecb62ffc167 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Wed, 28 Feb 2024 14:26:05 -0500 Subject: [PATCH 10/44] Add special case to delete merge-from-master workspace --- .../workflows/terraform_destroy_on_delete.yaml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/terraform_destroy_on_delete.yaml b/.github/workflows/terraform_destroy_on_delete.yaml index 171d2b337..405207928 100644 --- a/.github/workflows/terraform_destroy_on_delete.yaml +++ b/.github/workflows/terraform_destroy_on_delete.yaml @@ -4,7 +4,7 @@ on: [delete] jobs: build: - if: contains(github.event.ref_type, 'branch') && (! contains(github.event.ref, 'master')) && (! contains(github.event.ref, 'develop')) + if: contains(github.event.ref_type, 'branch') && (! github.ref == 'refs/heads/master') && (! 
github.ref == 'refs/heads/develop') runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 @@ -15,3 +15,16 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.aws_secret_dev }} AWS_REGION: ${{ secrets.aws_region_dev }} run: ./scripts/delete_workspace -w ${{ github.event.ref }} -g "no_sha_available" + + build-temp: + if: contains(github.event.ref_type, 'branch') && github.ref == 'refs/heads/merge-from-master' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - name: Destroy state and delete workspace + env: + ENV: production + AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_production }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.aws_secret_production }} + AWS_REGION: ${{ secrets.aws_region_production }} + run: ./scripts/delete_workspace -w ${{ github.event.ref }} -g "no_sha_available" From 7375a2757fd47d1c65aac160da1d926d53f95dc3 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Wed, 28 Feb 2024 14:43:57 -0500 Subject: [PATCH 11/44] Use ref_name instead of ref --- .github/workflows/terraform_destroy_on_delete.yaml | 4 ++-- .github/workflows/terraform_plan.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/terraform_destroy_on_delete.yaml b/.github/workflows/terraform_destroy_on_delete.yaml index 405207928..1858de96f 100644 --- a/.github/workflows/terraform_destroy_on_delete.yaml +++ b/.github/workflows/terraform_destroy_on_delete.yaml @@ -4,7 +4,7 @@ on: [delete] jobs: build: - if: contains(github.event.ref_type, 'branch') && (! github.ref == 'refs/heads/master') && (! github.ref == 'refs/heads/develop') + if: contains(github.event.ref_type, 'branch') && (! github.ref_name == 'master') && (! github.ref_name == 'develop') runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 @@ -17,7 +17,7 @@ jobs: run: ./scripts/delete_workspace -w ${{ github.event.ref }} -g "no_sha_available" build-temp: - if: contains(github.event.ref_type, 'branch') && github.ref == 'refs/heads/merge-from-master' + if: contains(github.event.ref_type, 'branch') && github.ref_name == 'merge-from-master' runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 diff --git a/.github/workflows/terraform_plan.yaml b/.github/workflows/terraform_plan.yaml index b70445069..f45a456ee 100644 --- a/.github/workflows/terraform_plan.yaml +++ b/.github/workflows/terraform_plan.yaml @@ -9,7 +9,7 @@ jobs: steps: - uses: actions/checkout@v1 - name: Plan production - if: success() && github.ref == 'refs/heads/master' + if: success() && github.ref_name == 'master' env: ENV: production AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_production }} @@ -18,7 +18,7 @@ jobs: run: ./scripts/infra plan -w ${{ github.base_ref }} - name: Plan staging - if: success() && github.ref == 'refs/heads/develop' + if: success() && github.ref_name == 'develop' env: ENV: staging AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_staging }} From 5ca921753fddf89933a2a983b1f95850e267532d Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Wed, 28 Feb 2024 14:48:32 -0500 Subject: [PATCH 12/44] Correct destroy scripts --- .github/workflows/terraform_destroy_on_delete.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/terraform_destroy_on_delete.yaml b/.github/workflows/terraform_destroy_on_delete.yaml index 1858de96f..6f8b26dcf 100644 --- a/.github/workflows/terraform_destroy_on_delete.yaml +++ b/.github/workflows/terraform_destroy_on_delete.yaml @@ -4,7 +4,7 @@ on: [delete] jobs: build: - if: contains(github.event.ref_type, 'branch') && (! github.ref_name == 'master') && (! 
github.ref_name == 'develop') + if: contains(github.event.ref_type, 'branch') && (! github.event.ref_name == 'master') && (! github.event.ref_name == 'develop') runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 @@ -17,7 +17,7 @@ jobs: run: ./scripts/delete_workspace -w ${{ github.event.ref }} -g "no_sha_available" build-temp: - if: contains(github.event.ref_type, 'branch') && github.ref_name == 'merge-from-master' + if: contains(github.event.ref_type, 'branch') && github.event.ref_name == 'merge-from-master' runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 From 07a1d807645a48f2b66822d8ec355a5dfb82ea05 Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Wed, 28 Feb 2024 14:57:30 -0800 Subject: [PATCH 13/44] GTC-2649 Include asset_id in the version get endpoint The version get endpoint already includes asset_type and asset_uri, but it would be more useful if it also included the associated asset_id. I believe it is fine to modify Version. In some cases (like add_new_version) where a new asset is being created in the background, the _version_response may have an asset with an empty asset_id, but this seems to be all fine (all tests pass). But let me know if you think I should leave Version unmodified and create a separate type only for the output of get_inversion that includes the asset_id. --- app/models/pydantic/versions.py | 3 ++- app/routes/datasets/versions.py | 4 ++-- tests_v2/unit/app/routes/datasets/test_version.py | 9 +++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/app/models/pydantic/versions.py b/app/models/pydantic/versions.py index 53c665c86..908256527 100644 --- a/app/models/pydantic/versions.py +++ b/app/models/pydantic/versions.py @@ -17,7 +17,8 @@ class Version(BaseRecord): metadata: Union[VersionMetadataOut, BaseModel] status: VersionStatus = VersionStatus.pending - assets: List[Tuple[str, str]] = list() + # Each element of assets is a tuple (asset_type, assert_uri, asset_id) + assets: List[Tuple[str, str, str]] = list() class VersionCreateIn(StrictBaseModel): diff --git a/app/routes/datasets/versions.py b/app/routes/datasets/versions.py index 9f54c7d25..ae4696138 100644 --- a/app/routes/datasets/versions.py +++ b/app/routes/datasets/versions.py @@ -500,14 +500,14 @@ async def _version_response( associated assets.""" assets: List[ORMAsset] = ( - await ORMAsset.select("asset_type", "asset_uri") + await ORMAsset.select("asset_type", "asset_uri", "asset_id") .where(ORMAsset.dataset == dataset) .where(ORMAsset.version == version) .where(ORMAsset.status == AssetStatus.saved) .gino.all() ) data = Version.from_orm(data).dict(by_alias=True) - data["assets"] = [(asset[0], asset[1]) for asset in assets] + data["assets"] = [(asset[0], asset[1], str(asset[2])) for asset in assets] return VersionResponse(data=Version(**data)) diff --git a/tests_v2/unit/app/routes/datasets/test_version.py b/tests_v2/unit/app/routes/datasets/test_version.py index 19f5b33bf..8d4b570c6 100644 --- a/tests_v2/unit/app/routes/datasets/test_version.py +++ b/tests_v2/unit/app/routes/datasets/test_version.py @@ -1,6 +1,7 @@ import pytest from _pytest.monkeypatch import MonkeyPatch from httpx import AsyncClient +import re from app.routes.datasets import versions from app.tasks import batch @@ -17,6 +18,14 @@ async def test_get_version(async_client: AsyncClient, generic_vector_source_vers assert resp.status_code == 200 data = resp.json() assert_jsend(data) + first_asset = data["data"]["assets"][0] + assert len(first_asset) == 3 + assert first_asset[0] == "Geo database table" + # 
Check asset_id looks reasonable + asset_id = first_asset[2] + assert len(asset_id) > 30 + pattern = re.compile(r'^[a-zA-Z0-9-]+$') + assert bool(pattern.match(asset_id)) @pytest.mark.asyncio From 37d990c27a08082852e2b5e167a470398a2561de Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Fri, 1 Mar 2024 04:31:43 -0500 Subject: [PATCH 14/44] Fix error caught by Solomon in review --- .github/workflows/terraform_plan.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/terraform_plan.yaml b/.github/workflows/terraform_plan.yaml index f45a456ee..5102d9339 100644 --- a/.github/workflows/terraform_plan.yaml +++ b/.github/workflows/terraform_plan.yaml @@ -9,7 +9,7 @@ jobs: steps: - uses: actions/checkout@v1 - name: Plan production - if: success() && github.ref_name == 'master' + if: success() && github.base_ref == 'master' env: ENV: production AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_production }} @@ -18,7 +18,7 @@ jobs: run: ./scripts/infra plan -w ${{ github.base_ref }} - name: Plan staging - if: success() && github.ref_name == 'develop' + if: success() && github.base_ref == 'develop' env: ENV: staging AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_staging }} From 090e0e79f8549a1a7fc095408280241df440b174 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Mon, 4 Mar 2024 17:31:49 +0300 Subject: [PATCH 15/44] New Relic distributed tracing 2nd pass --- newrelic.ini | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/newrelic.ini b/newrelic.ini index 8585f1207..02d9ced14 100644 --- a/newrelic.ini +++ b/newrelic.ini @@ -37,7 +37,7 @@ app_name = GFW Data API # New Relic offers distributed tracing for monitoring and analyzing modern # distributed systems.Enable distributed tracing. -distributed_tracing.enabled = true +; distributed_tracing.enabled = false # When "true", the agent collects performance data about your # application and reports this data to the New Relic UI at @@ -203,8 +203,10 @@ monitor_mode = false [newrelic:staging] app_name = GFW Data API (Staging) monitor_mode = true +distributed_tracing.enabled = false [newrelic:production] monitor_mode = true +distributed_tracing.enabled = true # --------------------------------------------------------------------------- \ No newline at end of file From 778d8e593252d8d3b3eb8d612f5456609ffd3bbb Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Mon, 4 Mar 2024 18:15:45 +0300 Subject: [PATCH 16/44] fix comment linecharacter --- newrelic.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/newrelic.ini b/newrelic.ini index 02d9ced14..6e6083fd2 100644 --- a/newrelic.ini +++ b/newrelic.ini @@ -37,7 +37,7 @@ app_name = GFW Data API # New Relic offers distributed tracing for monitoring and analyzing modern # distributed systems.Enable distributed tracing. 
-; distributed_tracing.enabled = false +# distributed_tracing.enabled = false # When "true", the agent collects performance data about your # application and reports this data to the New Relic UI at From cc38f6e3246c3f2f4b0d19f15a5f0001d11715e7 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Mon, 4 Mar 2024 14:18:04 -0500 Subject: [PATCH 17/44] Correct GHA references --- .github/workflows/terraform_destroy_on_delete.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/terraform_destroy_on_delete.yaml b/.github/workflows/terraform_destroy_on_delete.yaml index 6f8b26dcf..805f3049a 100644 --- a/.github/workflows/terraform_destroy_on_delete.yaml +++ b/.github/workflows/terraform_destroy_on_delete.yaml @@ -4,7 +4,7 @@ on: [delete] jobs: build: - if: contains(github.event.ref_type, 'branch') && (! github.event.ref_name == 'master') && (! github.event.ref_name == 'develop') + if: contains(github.event.ref_type, 'branch') && (! github.event.ref == 'refs/heads/master') && (! github.event.ref == 'refs/heads/develop') runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 @@ -17,7 +17,7 @@ jobs: run: ./scripts/delete_workspace -w ${{ github.event.ref }} -g "no_sha_available" build-temp: - if: contains(github.event.ref_type, 'branch') && github.event.ref_name == 'merge-from-master' + if: contains(github.event.ref_type, 'branch') && github.event.ref == 'refs/heads/merge-from-master' runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 From 300ba91d26d6633d5e258057f06f67135e5bc0e3 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Mon, 4 Mar 2024 14:31:33 -0500 Subject: [PATCH 18/44] Correct GHA references, hopefully last time --- .github/workflows/terraform_destroy_on_delete.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/terraform_destroy_on_delete.yaml b/.github/workflows/terraform_destroy_on_delete.yaml index 805f3049a..d814e8a70 100644 --- a/.github/workflows/terraform_destroy_on_delete.yaml +++ b/.github/workflows/terraform_destroy_on_delete.yaml @@ -4,7 +4,7 @@ on: [delete] jobs: build: - if: contains(github.event.ref_type, 'branch') && (! github.event.ref == 'refs/heads/master') && (! github.event.ref == 'refs/heads/develop') + if: contains(github.event.ref_type, 'branch') && (! github.event.ref == 'master') && (! github.event.ref == 'develop') runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 @@ -17,7 +17,7 @@ jobs: run: ./scripts/delete_workspace -w ${{ github.event.ref }} -g "no_sha_available" build-temp: - if: contains(github.event.ref_type, 'branch') && github.event.ref == 'refs/heads/merge-from-master' + if: contains(github.event.ref_type, 'branch') && github.event.ref == 'merge-from-master' runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 From 6521be49c1f638932bf9714d66b099552d3bed58 Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Wed, 28 Feb 2024 14:57:30 -0800 Subject: [PATCH 19/44] GTC-2649 Include asset_id in the version get endpoint The version get endpoint already includes asset_type and asset_uri, but it would be more useful if it also included the associated asset_id. I believe it is fine to modify Version. In some cases (like add_new_version) where a new asset is being created in the background, the _version_response may have an asset with an empty asset_id, but this seems to be all fine (all tests pass). But let me know if you think I should leave Version unmodified and create a separate type only for the output of get_inversion that includes the asset_id. 
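For illustration only (all values below are hypothetical, not taken from a real
response), each element of "assets" in the version GET response now carries
three items instead of two:

    [asset_type, asset_uri, asset_id]
    e.g. ["Geo database table",
          "s3://some-bucket/some_dataset/v1.2.3/table",
          "b2c5b903-90fa-4b88-b4cf-f25e95932d10"]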
--- app/models/pydantic/versions.py | 3 ++- app/routes/datasets/versions.py | 4 ++-- tests_v2/unit/app/routes/datasets/test_version.py | 9 +++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/app/models/pydantic/versions.py b/app/models/pydantic/versions.py index 53c665c86..908256527 100644 --- a/app/models/pydantic/versions.py +++ b/app/models/pydantic/versions.py @@ -17,7 +17,8 @@ class Version(BaseRecord): metadata: Union[VersionMetadataOut, BaseModel] status: VersionStatus = VersionStatus.pending - assets: List[Tuple[str, str]] = list() + # Each element of assets is a tuple (asset_type, assert_uri, asset_id) + assets: List[Tuple[str, str, str]] = list() class VersionCreateIn(StrictBaseModel): diff --git a/app/routes/datasets/versions.py b/app/routes/datasets/versions.py index 9f54c7d25..ae4696138 100644 --- a/app/routes/datasets/versions.py +++ b/app/routes/datasets/versions.py @@ -500,14 +500,14 @@ async def _version_response( associated assets.""" assets: List[ORMAsset] = ( - await ORMAsset.select("asset_type", "asset_uri") + await ORMAsset.select("asset_type", "asset_uri", "asset_id") .where(ORMAsset.dataset == dataset) .where(ORMAsset.version == version) .where(ORMAsset.status == AssetStatus.saved) .gino.all() ) data = Version.from_orm(data).dict(by_alias=True) - data["assets"] = [(asset[0], asset[1]) for asset in assets] + data["assets"] = [(asset[0], asset[1], str(asset[2])) for asset in assets] return VersionResponse(data=Version(**data)) diff --git a/tests_v2/unit/app/routes/datasets/test_version.py b/tests_v2/unit/app/routes/datasets/test_version.py index 19f5b33bf..8d4b570c6 100644 --- a/tests_v2/unit/app/routes/datasets/test_version.py +++ b/tests_v2/unit/app/routes/datasets/test_version.py @@ -1,6 +1,7 @@ import pytest from _pytest.monkeypatch import MonkeyPatch from httpx import AsyncClient +import re from app.routes.datasets import versions from app.tasks import batch @@ -17,6 +18,14 @@ async def test_get_version(async_client: AsyncClient, generic_vector_source_vers assert resp.status_code == 200 data = resp.json() assert_jsend(data) + first_asset = data["data"]["assets"][0] + assert len(first_asset) == 3 + assert first_asset[0] == "Geo database table" + # Check asset_id looks reasonable + asset_id = first_asset[2] + assert len(asset_id) > 30 + pattern = re.compile(r'^[a-zA-Z0-9-]+$') + assert bool(pattern.match(asset_id)) @pytest.mark.asyncio From 09d369cc8a753ebb24875b5fbac2490d7232510d Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Fri, 8 Mar 2024 17:08:59 -0500 Subject: [PATCH 20/44] Remove GHA special case for defunct production workspace --- .github/workflows/terraform_destroy_on_delete.yaml | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/.github/workflows/terraform_destroy_on_delete.yaml b/.github/workflows/terraform_destroy_on_delete.yaml index d814e8a70..8f948cb0a 100644 --- a/.github/workflows/terraform_destroy_on_delete.yaml +++ b/.github/workflows/terraform_destroy_on_delete.yaml @@ -15,16 +15,3 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.aws_secret_dev }} AWS_REGION: ${{ secrets.aws_region_dev }} run: ./scripts/delete_workspace -w ${{ github.event.ref }} -g "no_sha_available" - - build-temp: - if: contains(github.event.ref_type, 'branch') && github.event.ref == 'merge-from-master' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - name: Destroy state and delete workspace - env: - ENV: production - AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_production }} - AWS_SECRET_ACCESS_KEY: ${{ 
secrets.aws_secret_production }} - AWS_REGION: ${{ secrets.aws_region_production }} - run: ./scripts/delete_workspace -w ${{ github.event.ref }} -g "no_sha_available" From 5f454aee5dff89cc8cf3dcd939d8ac312da582a7 Mon Sep 17 00:00:00 2001 From: Justin Terry Date: Wed, 27 Mar 2024 11:18:53 -0700 Subject: [PATCH 21/44] Add logic to properly update bands metadataa --- .isort.cfg | 2 +- app/crud/metadata.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/.isort.cfg b/.isort.cfg index 689f896bf..9c838702b 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -2,4 +2,4 @@ line_length = 88 multi_line_output = 3 include_trailing_comma = True -known_third_party = _pytest,aenum,affine,aiohttp,alembic,async_lru,asyncpg,aws_utils,boto3,botocore,click,docker,errors,fastapi,fiona,gdal_utils,geoalchemy2,geojson,gfw_pixetl,gino,gino_starlette,google,httpx,httpx_auth,logger,logging_utils,moto,numpy,orjson,osgeo,pandas,pendulum,pglast,psutil,psycopg2,pydantic,pyproj,pytest,pytest_asyncio,rasterio,retrying,shapely,sqlalchemy,sqlalchemy_utils,starlette,tileputty,typer +known_third_party = _pytest,aenum,affine,aiohttp,alembic,async_lru,asyncpg,aws_utils,boto3,botocore,click,docker,errors,fastapi,fiona,gdal_utils,geoalchemy2,geojson,gfw_pixetl,gino,gino_starlette,google,httpx,httpx_auth,logger,logging_utils,moto,numpy,orjson,osgeo,pandas,pendulum,pglast,psutil,psycopg2,pydantic,pyproj,pytest,pytest_asyncio,rasterio,shapely,sqlalchemy,sqlalchemy_utils,starlette,tileputty,typer diff --git a/app/crud/metadata.py b/app/crud/metadata.py index d05976744..048fd4bff 100644 --- a/app/crud/metadata.py +++ b/app/crud/metadata.py @@ -181,12 +181,26 @@ async def get_asset_metadata(asset_id: UUID): async def update_asset_metadata(asset_id: UUID, **data) -> ORMAssetMetadata: """Update asset metadata.""" fields = data.pop("fields", None) + bands = data.pop("bands", None) asset_metadata: ORMAssetMetadata = await get_asset_metadata(asset_id) if data: await asset_metadata.update(**data).apply() + bands_metadata = [] + if bands: + for band in bands: + try: + band_metadata = await update_band_metadata(asset_metadata.id, **band) + except RecordNotFoundError: + band_metadata = await create_raster_band_metadata( + asset_metadata.id, **band + ) + bands_metadata.append(band_metadata) + + asset_metadata.bands = bands_metadata + fields_metadata = [] if fields: for field in fields: @@ -240,6 +254,14 @@ async def update_field_metadata( return field_metadata +async def update_band_metadata(metadata_id: UUID, **data) -> ORMFieldMetadata: + band_metadata: ORMRasterBandMetadata = await ORMRasterBandMetadata(metadata_id) + + await band_metadata.update(**data).apply() + + return band_metadata + + async def get_asset_fields(asset_metadata_id: UUID) -> List[ORMFieldMetadata]: fields_metadata: List[ORMFieldMetadata] = await ( ORMFieldMetadata.query.where( From 26f0607691a7263e122f0c9936b5ae4d96023135 Mon Sep 17 00:00:00 2001 From: Justin Terry Date: Wed, 27 Mar 2024 16:50:34 -0700 Subject: [PATCH 22/44] Fixed based on tests --- app/crud/metadata.py | 28 ++++++++-------- tests/crud/test_assets.py | 67 ++++++++++++++++++++++++++++++++++++--- tests/utils.py | 14 ++++++++ 3 files changed, 92 insertions(+), 17 deletions(-) diff --git a/app/crud/metadata.py b/app/crud/metadata.py index 048fd4bff..e88db2f46 100644 --- a/app/crud/metadata.py +++ b/app/crud/metadata.py @@ -188,16 +188,13 @@ async def update_asset_metadata(asset_id: UUID, **data) -> ORMAssetMetadata: if data: await asset_metadata.update(**data).apply() - 
bands_metadata = [] if bands: - for band in bands: - try: - band_metadata = await update_band_metadata(asset_metadata.id, **band) - except RecordNotFoundError: - band_metadata = await create_raster_band_metadata( - asset_metadata.id, **band - ) - bands_metadata.append(band_metadata) + try: + bands_metadata = await update_band_metadata(asset_metadata.id, bands) + except RecordNotFoundError: + bands_metadata = await create_raster_band_metadata( + asset_metadata.id, **bands + ) asset_metadata.bands = bands_metadata @@ -254,12 +251,17 @@ async def update_field_metadata( return field_metadata -async def update_band_metadata(metadata_id: UUID, **data) -> ORMFieldMetadata: - band_metadata: ORMRasterBandMetadata = await ORMRasterBandMetadata(metadata_id) +async def update_band_metadata(metadata_id: UUID, bands) -> ORMFieldMetadata: + bands_metadata: List[ + ORMRasterBandMetadata + ] = await ORMRasterBandMetadata.query.where( + ORMRasterBandMetadata.asset_metadata_id == metadata_id + ).gino.all() - await band_metadata.update(**data).apply() + for band, band_metadata in zip(bands, bands_metadata): + await band_metadata.update(**band).apply() - return band_metadata + return bands_metadata async def get_asset_fields(asset_metadata_id: UUID) -> List[ORMFieldMetadata]: diff --git a/tests/crud/test_assets.py b/tests/crud/test_assets.py index c36340ea1..a9a5c4ad8 100644 --- a/tests/crud/test_assets.py +++ b/tests/crud/test_assets.py @@ -10,18 +10,22 @@ create_asset, delete_asset, get_asset, - get_assets, get_assets_by_filter, update_asset, ) from app.crud.datasets import create_dataset from app.crud.versions import create_version from app.errors import RecordAlreadyExistsError, RecordNotFoundError -from app.models.pydantic.change_log import ChangeLog from app.models.pydantic.asset_metadata import DatabaseTableMetadata -from app.models.pydantic.metadata import VersionMetadata, DatasetMetadata +from app.models.pydantic.change_log import ChangeLog +from app.models.pydantic.metadata import DatasetMetadata, VersionMetadata -from ..utils import dataset_metadata, version_metadata, asset_metadata +from ..utils import ( + asset_metadata, + dataset_metadata, + raster_asset_metadata, + version_metadata, +) @pytest.mark.asyncio @@ -243,3 +247,58 @@ async def test_assets_metadata(): assert ( asset.metadata.fields[0].data_type == asset_metadata["fields"][0]["data_type"] ) + + +@pytest.mark.asyncio +async def test_band_metadata(): + """Testing band metadata.""" + + dataset = "test" + version = "v1.1.1" + + # Add a dataset + async with ContextEngine("WRITE"): + await create_dataset( + dataset, metadata=DatasetMetadata(**dataset_metadata).dict(by_alias=False) + ) + await create_version( + dataset, + version, + metadata=VersionMetadata(**version_metadata).dict(by_alias=True), + ) + new_asset = await create_asset( + dataset, + version, + asset_type="Raster tile set", + asset_uri="s3://path/to/file", + metadata=raster_asset_metadata, + ) + + asset_id = new_asset.asset_id + assert ( + new_asset.metadata.bands[0].values_table + == raster_asset_metadata["bands"][0]["values_table"] + ) + + updated_band_metadata = { + "bands": [ + { + "pixel_meaning": "year", + "values_table": { + "rows": [ + {"value": 1, "meaning": 2001}, + {"value": 2, "meaning": 2002}, + {"value": 3, "meaning": 2003}, + ], + }, + } + ] + } + + async with ContextEngine("WRITE"): + asset = await update_asset(asset_id, metadata=updated_band_metadata) + + assert ( + asset.metadata.bands[0].values_table + == updated_band_metadata["bands"][0]["values_table"] + ) 
diff --git a/tests/utils.py b/tests/utils.py index 307578ea4..c3dba73e6 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -39,6 +39,20 @@ "fields": [{"name": "field1", "data_type": "numeric", "unit": "meters"}] } +raster_asset_metadata = { + "bands": [ + { + "pixel_meaning": "year", + "values_table": { + "rows": [ + {"value": 1, "meaning": 2001}, + {"value": 2, "meaning": 2002}, + ], + }, + } + ] +} + generic_version_payload = { "metadata": version_metadata, "creation_options": { From 09e00bd2970bb9be7e719c89f54c47471f7fdc10 Mon Sep 17 00:00:00 2001 From: Justin Terry Date: Wed, 27 Mar 2024 16:52:44 -0700 Subject: [PATCH 23/44] Remove unintended change --- .isort.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.isort.cfg b/.isort.cfg index 9c838702b..3649e86b3 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -2,4 +2,4 @@ line_length = 88 multi_line_output = 3 include_trailing_comma = True -known_third_party = _pytest,aenum,affine,aiohttp,alembic,async_lru,asyncpg,aws_utils,boto3,botocore,click,docker,errors,fastapi,fiona,gdal_utils,geoalchemy2,geojson,gfw_pixetl,gino,gino_starlette,google,httpx,httpx_auth,logger,logging_utils,moto,numpy,orjson,osgeo,pandas,pendulum,pglast,psutil,psycopg2,pydantic,pyproj,pytest,pytest_asyncio,rasterio,shapely,sqlalchemy,sqlalchemy_utils,starlette,tileputty,typer +known_third_party = _pytest,aenum,affine,aiohttp,alembic,async_lru,asyncpg,aws_utils,boto3,botocore,click,docker,errors,fastapi,fiona,gdal_utils,geoalchemy2,geojson,gfw_pixetl,gino,gino_starlette,google,httpx,httpx_auth,logger,logging_utils,moto,numpy,orjson,osgeo,pandas,pendulum,pglast,psutil,psycopg2,pydantic,pyproj,pytest,pytest_asyncio,rasterio,retrying,shapely,sqlalchemy,sqlalchemy_utils,starlette,tileputty,typer \ No newline at end of file From 9ff7e4e4f917647ad3ccd32d2c0872a891deac67 Mon Sep 17 00:00:00 2001 From: Justin Terry Date: Wed, 27 Mar 2024 16:53:34 -0700 Subject: [PATCH 24/44] Remove unintended change --- .isort.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.isort.cfg b/.isort.cfg index 3649e86b3..689f896bf 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -2,4 +2,4 @@ line_length = 88 multi_line_output = 3 include_trailing_comma = True -known_third_party = _pytest,aenum,affine,aiohttp,alembic,async_lru,asyncpg,aws_utils,boto3,botocore,click,docker,errors,fastapi,fiona,gdal_utils,geoalchemy2,geojson,gfw_pixetl,gino,gino_starlette,google,httpx,httpx_auth,logger,logging_utils,moto,numpy,orjson,osgeo,pandas,pendulum,pglast,psutil,psycopg2,pydantic,pyproj,pytest,pytest_asyncio,rasterio,retrying,shapely,sqlalchemy,sqlalchemy_utils,starlette,tileputty,typer \ No newline at end of file +known_third_party = _pytest,aenum,affine,aiohttp,alembic,async_lru,asyncpg,aws_utils,boto3,botocore,click,docker,errors,fastapi,fiona,gdal_utils,geoalchemy2,geojson,gfw_pixetl,gino,gino_starlette,google,httpx,httpx_auth,logger,logging_utils,moto,numpy,orjson,osgeo,pandas,pendulum,pglast,psutil,psycopg2,pydantic,pyproj,pytest,pytest_asyncio,rasterio,retrying,shapely,sqlalchemy,sqlalchemy_utils,starlette,tileputty,typer From 6c3fbb14e43933e098789e4a24743468f3c383c1 Mon Sep 17 00:00:00 2001 From: Justin Terry Date: Wed, 27 Mar 2024 17:04:31 -0700 Subject: [PATCH 25/44] Throw error for invalid update request --- app/crud/metadata.py | 7 ++++++- app/routes/assets/asset.py | 14 ++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/app/crud/metadata.py b/app/crud/metadata.py index e88db2f46..cef5955a3 100644 --- a/app/crud/metadata.py +++ 
b/app/crud/metadata.py @@ -4,7 +4,7 @@ from asyncpg import UniqueViolationError -from ..errors import RecordAlreadyExistsError, RecordNotFoundError +from ..errors import BadRequestError, RecordAlreadyExistsError, RecordNotFoundError from ..models.orm.asset_metadata import AssetMetadata as ORMAssetMetadata from ..models.orm.asset_metadata import FieldMetadata as ORMFieldMetadata from ..models.orm.asset_metadata import RasterBandMetadata as ORMRasterBandMetadata @@ -258,6 +258,11 @@ async def update_band_metadata(metadata_id: UUID, bands) -> ORMFieldMetadata: ORMRasterBandMetadata.asset_metadata_id == metadata_id ).gino.all() + if len(bands) != len(bands_metadata): + raise BadRequestError( + f"Update request must include metadata for {len(bands)} bands." + ) + for band, band_metadata in zip(bands, bands_metadata): await band_metadata.update(**band).apply() diff --git a/app/routes/assets/asset.py b/app/routes/assets/asset.py index e408d639f..0d39879be 100644 --- a/app/routes/assets/asset.py +++ b/app/routes/assets/asset.py @@ -11,11 +11,6 @@ from typing import List, Optional, Union from uuid import UUID -# from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Path, status -from fastapi.responses import ORJSONResponse -from starlette.responses import JSONResponse - -from app.models.pydantic.responses import Response from fastapi import ( APIRouter, BackgroundTasks, @@ -27,13 +22,18 @@ status, ) +# from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Path, status +from fastapi.responses import ORJSONResponse +from starlette.responses import JSONResponse + +from app.models.pydantic.responses import Response from app.settings.globals import API_URL from ...authentication.token import is_admin from ...crud import assets from ...crud import metadata as metadata_crud from ...crud import tasks -from ...errors import RecordAlreadyExistsError, RecordNotFoundError +from ...errors import BadRequestError, RecordAlreadyExistsError, RecordNotFoundError from ...models.enum.assets import is_database_asset, is_single_file_asset from ...models.orm.asset_metadata import FieldMetadata as ORMFieldMetadata from ...models.orm.assets import Asset as ORMAsset @@ -112,6 +112,8 @@ async def update_asset( row: ORMAsset = await assets.update_asset(asset_id, **input_data) except RecordNotFoundError as e: raise HTTPException(status_code=404, detail=str(e)) + except BadRequestError as e: + raise HTTPException(status_code=400, detail=str(e)) except NotImplementedError as e: raise HTTPException(status_code=501, detail=str(e)) From 2122dfdbe8a5f5df23022e098b0b4b5aacec0b4a Mon Sep 17 00:00:00 2001 From: Justin Terry Date: Wed, 27 Mar 2024 17:12:15 -0700 Subject: [PATCH 26/44] Add documentation --- app/routes/assets/asset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/app/routes/assets/asset.py b/app/routes/assets/asset.py index 0d39879be..7e6b864d0 100644 --- a/app/routes/assets/asset.py +++ b/app/routes/assets/asset.py @@ -104,7 +104,11 @@ async def update_asset( request: AssetUpdateIn, is_authorized: bool = Depends(is_admin), ) -> AssetResponse: - """Update Asset metadata.""" + """Update Asset metadata. + + For raster band metadata, raster band updates must include all bands + in the correct order. If no updated is needed for a band, pass `{}`. 
+ """ input_data = request.dict(exclude_none=True, by_alias=True) From 96463debf9ef56f7ffb57fb6399384fecb3d3262 Mon Sep 17 00:00:00 2001 From: Justin Terry Date: Fri, 29 Mar 2024 13:49:47 -0700 Subject: [PATCH 27/44] Use DB model correctly --- .isort.cfg | 2 +- app/crud/metadata.py | 72 ++++++++++++++++++++++---------- app/models/orm/asset_metadata.py | 2 +- app/routes/assets/asset.py | 6 +-- 4 files changed, 54 insertions(+), 28 deletions(-) diff --git a/.isort.cfg b/.isort.cfg index 689f896bf..9c838702b 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -2,4 +2,4 @@ line_length = 88 multi_line_output = 3 include_trailing_comma = True -known_third_party = _pytest,aenum,affine,aiohttp,alembic,async_lru,asyncpg,aws_utils,boto3,botocore,click,docker,errors,fastapi,fiona,gdal_utils,geoalchemy2,geojson,gfw_pixetl,gino,gino_starlette,google,httpx,httpx_auth,logger,logging_utils,moto,numpy,orjson,osgeo,pandas,pendulum,pglast,psutil,psycopg2,pydantic,pyproj,pytest,pytest_asyncio,rasterio,retrying,shapely,sqlalchemy,sqlalchemy_utils,starlette,tileputty,typer +known_third_party = _pytest,aenum,affine,aiohttp,alembic,async_lru,asyncpg,aws_utils,boto3,botocore,click,docker,errors,fastapi,fiona,gdal_utils,geoalchemy2,geojson,gfw_pixetl,gino,gino_starlette,google,httpx,httpx_auth,logger,logging_utils,moto,numpy,orjson,osgeo,pandas,pendulum,pglast,psutil,psycopg2,pydantic,pyproj,pytest,pytest_asyncio,rasterio,shapely,sqlalchemy,sqlalchemy_utils,starlette,tileputty,typer diff --git a/app/crud/metadata.py b/app/crud/metadata.py index cef5955a3..8765d0746 100644 --- a/app/crud/metadata.py +++ b/app/crud/metadata.py @@ -4,7 +4,7 @@ from asyncpg import UniqueViolationError -from ..errors import BadRequestError, RecordAlreadyExistsError, RecordNotFoundError +from ..errors import RecordAlreadyExistsError, RecordNotFoundError from ..models.orm.asset_metadata import AssetMetadata as ORMAssetMetadata from ..models.orm.asset_metadata import FieldMetadata as ORMFieldMetadata from ..models.orm.asset_metadata import RasterBandMetadata as ORMRasterBandMetadata @@ -188,13 +188,19 @@ async def update_asset_metadata(asset_id: UUID, **data) -> ORMAssetMetadata: if data: await asset_metadata.update(**data).apply() + bands_metadata = [] if bands: - try: - bands_metadata = await update_band_metadata(asset_metadata.id, bands) - except RecordNotFoundError: - bands_metadata = await create_raster_band_metadata( - asset_metadata.id, **bands - ) + for band in bands: + try: + pixel_meaning = band.pop("pixel_meaning") + band_metadata = await update_band_metadata( + asset_metadata.id, pixel_meaning, **band + ) + except RecordNotFoundError: + bands_metadata = await create_raster_band_metadata( + asset_metadata.id, **bands + ) + bands_metadata.append(band_metadata) asset_metadata.bands = bands_metadata @@ -213,6 +219,21 @@ async def update_asset_metadata(asset_id: UUID, **data) -> ORMAssetMetadata: return asset_metadata + fields_metadata = [] + if fields: + for field in fields: + try: + field_metadata = await update_field_metadata( + asset_metadata.id, field["name"], **field + ) + except RecordNotFoundError: + field_metadata = await create_field_metadata(asset_metadata.id, **field) + fields_metadata.append(field_metadata) + + asset_metadata.fields = fields_metadata + + return asset_metadata + async def delete_asset_metadata(asset_id: UUID) -> ORMAssetMetadata: asset_metadata: ORMAssetMetadata = await get_asset_metadata(asset_id) @@ -251,22 +272,16 @@ async def update_field_metadata( return field_metadata -async def 
update_band_metadata(metadata_id: UUID, bands) -> ORMFieldMetadata: - bands_metadata: List[ - ORMRasterBandMetadata - ] = await ORMRasterBandMetadata.query.where( - ORMRasterBandMetadata.asset_metadata_id == metadata_id - ).gino.all() - - if len(bands) != len(bands_metadata): - raise BadRequestError( - f"Update request must include metadata for {len(bands)} bands." - ) +async def update_band_metadata( + metadata_id: UUID, pixel_meaning: str, **data +) -> ORMFieldMetadata: + band_metadata: ORMRasterBandMetadata = await get_asset_raster_band( + metadata_id, pixel_meaning + ) - for band, band_metadata in zip(bands, bands_metadata): - await band_metadata.update(**band).apply() + await band_metadata.update(**data).apply() - return bands_metadata + return band_metadata async def get_asset_fields(asset_metadata_id: UUID) -> List[ORMFieldMetadata]: @@ -301,6 +316,21 @@ async def get_asset_field(asset_metadata_id: UUID, field_name: str) -> ORMFieldM return field_metadata +async def get_asset_raster_band( + asset_metadata_id: UUID, pixel_meaning: str +) -> ORMRasterBandMetadata: + band_metadata: ORMRasterBandMetadata = await ORMRasterBandMetadata.get( + [asset_metadata_id, pixel_meaning] + ) + + if band_metadata is None: + raise RecordNotFoundError( + f"No band metadata record found for pixel meaning {pixel_meaning}." + ) + + return band_metadata + + def update_metadata(row: Base, parent: Base): """Dynamically update metadata with parent metadata. diff --git a/app/models/orm/asset_metadata.py b/app/models/orm/asset_metadata.py index 7b031ba40..4b51c77b9 100644 --- a/app/models/orm/asset_metadata.py +++ b/app/models/orm/asset_metadata.py @@ -52,8 +52,8 @@ class RasterBandMetadata(db.Model): name="asset_metadata_id_fk", onupdate="CASCADE", ondelete="CASCADE", - primary_key=True, ), + primary_key=True, ) pixel_meaning = db.Column(db.String, primary_key=True) description = db.Column(db.String) diff --git a/app/routes/assets/asset.py b/app/routes/assets/asset.py index 7e6b864d0..0d39879be 100644 --- a/app/routes/assets/asset.py +++ b/app/routes/assets/asset.py @@ -104,11 +104,7 @@ async def update_asset( request: AssetUpdateIn, is_authorized: bool = Depends(is_admin), ) -> AssetResponse: - """Update Asset metadata. - - For raster band metadata, raster band updates must include all bands - in the correct order. If no updated is needed for a band, pass `{}`. - """ + """Update Asset metadata.""" input_data = request.dict(exclude_none=True, by_alias=True) From 374705feffc77752221c77d38dcc97a2e8d672eb Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Wed, 27 Mar 2024 12:49:44 -0700 Subject: [PATCH 28/44] GTC-2449 Catch access denied error in get_aws_files() Currently, we are only catching NoSuchBucket errors, but there can also be AccessDenied errors. If an 'access denied' happens, we get an uncaught exception, which gives an internal server error. Strangely, s3_client.exceptions doesn't have AccessDenied, so we just catch all botocore exceptions (which is probably what we should do anyway). I couldn't create any new test to emulate access denied, so will just test it out in staging. 
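As a minimal sketch (illustrative only -- the bucket name and call site below are
made up and are not part of this change), this is the failure mode that catching
botocore.exceptions.ClientError now covers:

    import boto3
    import botocore

    s3_client = boto3.client("s3")
    try:
        # Listing a bucket we are not allowed to read raises a generic
        # ClientError rather than a bucket-specific exception class.
        s3_client.list_objects_v2(Bucket="some-bucket-we-cannot-read")
    except botocore.exceptions.ClientError as error:
        # The error code distinguishes the cases, e.g. "AccessDenied"
        # or "NoSuchBucket".
        print(error.response["Error"]["Code"])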
--- app/routes/datasets/versions.py | 2 +- app/utils/aws.py | 8 +++++++- tests/routes/datasets/test_versions.py | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) mode change 100644 => 100755 tests/routes/datasets/test_versions.py diff --git a/app/routes/datasets/versions.py b/app/routes/datasets/versions.py index ae4696138..50cdf0356 100644 --- a/app/routes/datasets/versions.py +++ b/app/routes/datasets/versions.py @@ -560,7 +560,7 @@ def _verify_source_file_access(sources: List[str]) -> None: raise HTTPException( status_code=400, detail=( - "Cannot access all of the source files. " + "Cannot access all of the source files (non-existent or access denied). " f"Invalid sources: {invalid_sources}" ), ) diff --git a/app/utils/aws.py b/app/utils/aws.py index fe981be42..2f9e2da66 100644 --- a/app/utils/aws.py +++ b/app/utils/aws.py @@ -1,8 +1,10 @@ from typing import Any, Dict, List, Optional, Sequence import boto3 +import botocore import httpx from httpx_auth import AWS4Auth +from fastapi.logger import logger from ..settings.globals import ( AWS_REGION, @@ -108,7 +110,11 @@ def get_aws_files( if exit_after_max and num_matches >= exit_after_max: break - except s3_client.exceptions.NoSuchBucket: + # Strangely, s3_client.exceptions has NoSuchBucket, but doesn't have + # AccessDenied, even though you can get that error, so we just catch all botocore + # exceptions. + except botocore.exceptions.ClientError as error: + logger.warning(f"get_aws_file: {error}") matches = list() return matches diff --git a/tests/routes/datasets/test_versions.py b/tests/routes/datasets/test_versions.py old mode 100644 new mode 100755 index 492903d75..e1cc6a1aa --- a/tests/routes/datasets/test_versions.py +++ b/tests/routes/datasets/test_versions.py @@ -310,7 +310,7 @@ async def test_invalid_source_uri(async_client: AsyncClient): assert response.json()["status"] == "failed" assert ( response.json()["message"] - == f"Cannot access all of the source files. Invalid sources: ['{bad_uri}']" + == f"Cannot access all of the source files (non-existent or access denied). Invalid sources: ['{bad_uri}']" ) # Create a version with a valid source_uri so we have something to append to @@ -332,7 +332,7 @@ async def test_invalid_source_uri(async_client: AsyncClient): assert response.json()["status"] == "failed" assert ( response.json()["message"] - == f"Cannot access all of the source files. Invalid sources: ['{bad_uri}']" + == f"Cannot access all of the source files (non-existent or access denied). 
Invalid sources: ['{bad_uri}']" ) # Test appending to a version that DOESN'T exist From a91da8c177fac1d8b186131ed51a3a5566e2ba82 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Wed, 3 Apr 2024 14:25:39 -0400 Subject: [PATCH 29/44] Cache _get_data_environment function --- app/routes/datasets/queries.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/app/routes/datasets/queries.py b/app/routes/datasets/queries.py index 5a43fb307..4e39c107c 100755 --- a/app/routes/datasets/queries.py +++ b/app/routes/datasets/queries.py @@ -7,13 +7,13 @@ from uuid import UUID, uuid4 import httpx +from async_lru import alru_cache from asyncpg import DataError, InsufficientPrivilegeError, SyntaxOrAccessError from fastapi import APIRouter, Depends, HTTPException, Query from fastapi import Request as FastApiRequest from fastapi import Response as FastApiResponse from fastapi.encoders import jsonable_encoder from fastapi.logger import logger - # from fastapi.openapi.models import APIKey from fastapi.responses import RedirectResponse from pglast import printers # noqa @@ -21,11 +21,9 @@ from pglast.parser import ParseError from pglast.printer import RawStream from pydantic.tools import parse_obj_as -from sqlalchemy.sql import and_ from ...authentication.token import is_gfwpro_admin_for_query from ...application import db - # from ...authentication.api_keys import get_api_key from ...crud import assets from ...models.enum.assets import AssetType @@ -62,7 +60,6 @@ from ...models.enum.queries import QueryFormat, QueryType from ...models.orm.assets import Asset as AssetORM from ...models.orm.queries.raster_assets import latest_raster_tile_sets -from ...models.orm.versions import Version as VersionORM from ...models.pydantic.asset_metadata import RasterTable, RasterTableRow from ...models.pydantic.creation_options import NoDataType from ...models.pydantic.geostore import Geometry, GeostoreCommon @@ -659,13 +656,14 @@ async def _query_raster_lambda( def _get_area_density_name(nm): - """Return empty string if nm doesn't not have an area-density suffix, else + """Return empty string if nm doesn't have an area-density suffix, else return nm with the area-density suffix removed.""" for suffix in AREA_DENSITY_RASTER_SUFFIXES: if nm.endswith(suffix): return nm[:-len(suffix)] return "" + def _get_default_layer(dataset, pixel_meaning): default_type = pixel_meaning area_density_name = _get_area_density_name(default_type) @@ -683,6 +681,7 @@ def _get_default_layer(dataset, pixel_meaning): return f"{dataset}__{default_type}" +@alru_cache(maxsize=16, ttl=300.0) async def _get_data_environment(grid: Grid) -> DataEnvironment: # get all raster tile set assets with the same grid. 
latest_tile_sets = await db.all(latest_raster_tile_sets, {"grid": grid}) From bc904576805a6e581f1f52e74ad9638aad524ea7 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Thu, 4 Apr 2024 16:23:17 -0400 Subject: [PATCH 30/44] Make a per-function async test client to avoid caching problems during testing --- tests_v2/conftest.py | 6 ++++++ tests_v2/unit/app/routes/analysis/test_analysis.py | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tests_v2/conftest.py b/tests_v2/conftest.py index 33fbb3a53..9bb49cb8f 100755 --- a/tests_v2/conftest.py +++ b/tests_v2/conftest.py @@ -93,6 +93,12 @@ async def async_client(db, init_db) -> AsyncGenerator[AsyncClient, None]: app.dependency_overrides = {} +@pytest_asyncio.fixture(scope="function") +async def async_client_per_function(async_client) -> AsyncGenerator[AsyncClient, None]: + """Async Test Client that's limited to per-function scope.""" + yield async_client + + @pytest_asyncio.fixture async def async_client_unauthenticated( db, init_db diff --git a/tests_v2/unit/app/routes/analysis/test_analysis.py b/tests_v2/unit/app/routes/analysis/test_analysis.py index d8dcd6673..5d55a0f72 100644 --- a/tests_v2/unit/app/routes/analysis/test_analysis.py +++ b/tests_v2/unit/app/routes/analysis/test_analysis.py @@ -79,8 +79,11 @@ async def test_analysis_with_huge_geostore( @pytest.mark.asyncio async def test_raster_analysis_payload_shape( - generic_dataset, async_client: AsyncClient, monkeypatch: MonkeyPatch + generic_dataset, async_client_per_function: AsyncClient, monkeypatch: MonkeyPatch ): + """Note that we use the async_client_per_function fixture to avoid + the cache getting polluted from other tests""" + dataset_name, _ = generic_dataset pixel_meaning: str = "date_conf" no_data_value = 0 From 824be2897efe39ae8e117aabb00e6a267612088b Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Thu, 4 Apr 2024 21:55:53 -0400 Subject: [PATCH 31/44] I forgot to pass the perf-function test client. 
Fix that --- tests_v2/unit/app/routes/analysis/test_analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests_v2/unit/app/routes/analysis/test_analysis.py b/tests_v2/unit/app/routes/analysis/test_analysis.py index 5d55a0f72..7536be157 100644 --- a/tests_v2/unit/app/routes/analysis/test_analysis.py +++ b/tests_v2/unit/app/routes/analysis/test_analysis.py @@ -89,7 +89,7 @@ async def test_raster_analysis_payload_shape( no_data_value = 0 async with custom_raster_version( - async_client, + async_client_per_function, dataset_name, monkeypatch, pixel_meaning=pixel_meaning, @@ -106,7 +106,7 @@ async def test_raster_analysis_payload_shape( ) monkeypatch.setattr(geostore.rw_api, "get_geostore", mock_rw_get_geostore) - _ = await async_client.get( + _ = await async_client_per_function.get( f"/analysis/zonal/17076d5ea9f214a5bdb68cc40433addb?geostore_origin=rw&group_by=umd_tree_cover_loss__year&filters=is__umd_regional_primary_forest_2001&filters=umd_tree_cover_density_2000__30&sum=area__ha&start_date=2001" ) payload = mock_invoke_lambda.call_args.args[1] From a5ca0706265b7b0c7735dde902dca3f982d06b31 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Fri, 5 Apr 2024 09:47:30 -0400 Subject: [PATCH 32/44] Reluctantly duplicate async_client fixture for per-function version --- tests_v2/conftest.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tests_v2/conftest.py b/tests_v2/conftest.py index 9bb49cb8f..c10b1985d 100755 --- a/tests_v2/conftest.py +++ b/tests_v2/conftest.py @@ -96,7 +96,25 @@ async def async_client(db, init_db) -> AsyncGenerator[AsyncClient, None]: @pytest_asyncio.fixture(scope="function") async def async_client_per_function(async_client) -> AsyncGenerator[AsyncClient, None]: """Async Test Client that's limited to per-function scope.""" - yield async_client + from app.main import app + + # mock authentication function to avoid having to reach out to RW API during tests + app.dependency_overrides[is_admin] = bool_function_closure(True, with_args=False) + app.dependency_overrides[is_service_account] = bool_function_closure( + True, with_args=False + ) + app.dependency_overrides[get_user] = get_admin_mocked + + async with AsyncClient( + app=app, + base_url="http://test", + trust_env=False, + headers={"Origin": "https://www.globalforestwatch.org"}, + ) as client: + yield client + + # Clean up + app.dependency_overrides = {} @pytest_asyncio.fixture From 9c9e7a7725955d13233e81986832a88b90792443 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Fri, 5 Apr 2024 09:59:08 -0400 Subject: [PATCH 33/44] Fix fixture deps for async_client_per_function --- tests_v2/conftest.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests_v2/conftest.py b/tests_v2/conftest.py index c10b1985d..5051276ec 100755 --- a/tests_v2/conftest.py +++ b/tests_v2/conftest.py @@ -94,8 +94,10 @@ async def async_client(db, init_db) -> AsyncGenerator[AsyncClient, None]: @pytest_asyncio.fixture(scope="function") -async def async_client_per_function(async_client) -> AsyncGenerator[AsyncClient, None]: - """Async Test Client that's limited to per-function scope.""" +async def async_client_per_function(db, init_db) -> AsyncGenerator[AsyncClient, None]: + """Async Test Client that's limited to per-function scope. 
+ Use sparingly (such as when you need to clear caches) to avoid lengthier tests + """ from app.main import app # mock authentication function to avoid having to reach out to RW API during tests From a7568693b5988bb6fd7b12489fd47c043a0a4bff Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Mon, 15 Apr 2024 15:12:21 -0400 Subject: [PATCH 34/44] Use a feature of async_lru to clear the cache in test --- tests_v2/unit/app/routes/analysis/test_analysis.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests_v2/unit/app/routes/analysis/test_analysis.py b/tests_v2/unit/app/routes/analysis/test_analysis.py index 7536be157..547a7d37d 100644 --- a/tests_v2/unit/app/routes/analysis/test_analysis.py +++ b/tests_v2/unit/app/routes/analysis/test_analysis.py @@ -81,8 +81,8 @@ async def test_analysis_with_huge_geostore( async def test_raster_analysis_payload_shape( generic_dataset, async_client_per_function: AsyncClient, monkeypatch: MonkeyPatch ): - """Note that we use the async_client_per_function fixture to avoid - the cache getting polluted from other tests""" + """Note that this test depends on the output of _get_data_environment + which will likely have cached values from other tests, so we clear it.""" dataset_name, _ = generic_dataset pixel_meaning: str = "date_conf" @@ -106,6 +106,9 @@ async def test_raster_analysis_payload_shape( ) monkeypatch.setattr(geostore.rw_api, "get_geostore", mock_rw_get_geostore) + # The other tests will have polluted the data env cache. Clear it. + queries._get_data_environment.cache_clear() + _ = await async_client_per_function.get( f"/analysis/zonal/17076d5ea9f214a5bdb68cc40433addb?geostore_origin=rw&group_by=umd_tree_cover_loss__year&filters=is__umd_regional_primary_forest_2001&filters=umd_tree_cover_density_2000__30&sum=area__ha&start_date=2001" ) From 638a97c9cd81b8680ab0aa89983c1f9df1686609 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Mon, 15 Apr 2024 15:13:33 -0400 Subject: [PATCH 35/44] Remove unneeded per-function async client fixture --- tests_v2/conftest.py | 26 ------------------- .../unit/app/routes/analysis/test_analysis.py | 2 +- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/tests_v2/conftest.py b/tests_v2/conftest.py index 5051276ec..33fbb3a53 100755 --- a/tests_v2/conftest.py +++ b/tests_v2/conftest.py @@ -93,32 +93,6 @@ async def async_client(db, init_db) -> AsyncGenerator[AsyncClient, None]: app.dependency_overrides = {} -@pytest_asyncio.fixture(scope="function") -async def async_client_per_function(db, init_db) -> AsyncGenerator[AsyncClient, None]: - """Async Test Client that's limited to per-function scope. 
- Use sparingly (such as when you need to clear caches) to avoid lengthier tests - """ - from app.main import app - - # mock authentication function to avoid having to reach out to RW API during tests - app.dependency_overrides[is_admin] = bool_function_closure(True, with_args=False) - app.dependency_overrides[is_service_account] = bool_function_closure( - True, with_args=False - ) - app.dependency_overrides[get_user] = get_admin_mocked - - async with AsyncClient( - app=app, - base_url="http://test", - trust_env=False, - headers={"Origin": "https://www.globalforestwatch.org"}, - ) as client: - yield client - - # Clean up - app.dependency_overrides = {} - - @pytest_asyncio.fixture async def async_client_unauthenticated( db, init_db diff --git a/tests_v2/unit/app/routes/analysis/test_analysis.py b/tests_v2/unit/app/routes/analysis/test_analysis.py index 547a7d37d..0055e96e1 100644 --- a/tests_v2/unit/app/routes/analysis/test_analysis.py +++ b/tests_v2/unit/app/routes/analysis/test_analysis.py @@ -79,7 +79,7 @@ async def test_analysis_with_huge_geostore( @pytest.mark.asyncio async def test_raster_analysis_payload_shape( - generic_dataset, async_client_per_function: AsyncClient, monkeypatch: MonkeyPatch + generic_dataset, async_client: AsyncClient, monkeypatch: MonkeyPatch ): """Note that this test depends on the output of _get_data_environment which will likely have cached values from other tests, so we clear it.""" From 0b19c4691e29ab29a0a33d38e766cf7cf958430c Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Mon, 15 Apr 2024 15:45:25 -0400 Subject: [PATCH 36/44] Fix references to removed async_client_per_function --- tests_v2/unit/app/routes/analysis/test_analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests_v2/unit/app/routes/analysis/test_analysis.py b/tests_v2/unit/app/routes/analysis/test_analysis.py index 0055e96e1..1def7fb39 100644 --- a/tests_v2/unit/app/routes/analysis/test_analysis.py +++ b/tests_v2/unit/app/routes/analysis/test_analysis.py @@ -89,7 +89,7 @@ async def test_raster_analysis_payload_shape( no_data_value = 0 async with custom_raster_version( - async_client_per_function, + async_client, dataset_name, monkeypatch, pixel_meaning=pixel_meaning, @@ -109,7 +109,7 @@ async def test_raster_analysis_payload_shape( # The other tests will have polluted the data env cache. Clear it. queries._get_data_environment.cache_clear() - _ = await async_client_per_function.get( + _ = await async_client.get( f"/analysis/zonal/17076d5ea9f214a5bdb68cc40433addb?geostore_origin=rw&group_by=umd_tree_cover_loss__year&filters=is__umd_regional_primary_forest_2001&filters=umd_tree_cover_density_2000__30&sum=area__ha&start_date=2001" ) payload = mock_invoke_lambda.call_args.args[1] From 24c383416097b77c48530eef1b68fdc57830d16a Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Mon, 15 Apr 2024 09:04:59 -0700 Subject: [PATCH 37/44] GTC-2708 Expire all tile cache files when deleting a dataset version We currently delete all datalake files, but have not been deleting the tile cache files in s3://gfw-tiles. The expire_s3_objects() function seems to work, since it is already used when deleting only a static vector tile cache or a raster tile cache. Now that expire_s3_objects is being called when we delete a version, I needed to add in a monkeypatch for expire_s3_objects() in create_vector_source_version() and generic_raster_version() for the tests_v2 tests. 
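For context, here is a minimal sketch (an assumed shape, not necessarily how
expire_s3_objects() is implemented here) of what expiring all objects under a
dataset/version prefix typically involves with boto3:

    import boto3

    def expire_prefix(bucket: str, prefix: str) -> None:
        """Expire every object under `prefix` via an S3 lifecycle rule."""
        s3 = boto3.client("s3")
        s3.put_bucket_lifecycle_configuration(
            Bucket=bucket,
            LifecycleConfiguration={
                "Rules": [
                    {
                        "ID": f"expire_{prefix}",
                        "Filter": {"Prefix": prefix},
                        "Status": "Enabled",
                        "Expiration": {"Days": 1},
                    }
                ]
            },
        )

Note that put_bucket_lifecycle_configuration replaces the bucket's entire
lifecycle configuration, so a real implementation has to merge with any existing
rules, and the caller needs the s3:PutLifecycleConfiguration permission on the
tile cache bucket.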
--- app/tasks/delete_assets.py | 2 +- tests_v2/conftest.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/app/tasks/delete_assets.py b/app/tasks/delete_assets.py index 4582bae15..2528d1a0c 100644 --- a/app/tasks/delete_assets.py +++ b/app/tasks/delete_assets.py @@ -11,7 +11,7 @@ async def delete_all_assets(dataset: str, version: str) -> None: await delete_database_table_asset(dataset, version) delete_s3_objects(DATA_LAKE_BUCKET, f"{dataset}/{version}/") - # expire_s3_objects(TILE_CACHE_BUCKET, f"{dataset}/{version}/") + expire_s3_objects(TILE_CACHE_BUCKET, f"{dataset}/{version}/") flush_cloudfront_cache(TILE_CACHE_CLOUDFRONT_ID, [f"/{dataset}/{version}/*"]) diff --git a/tests_v2/conftest.py b/tests_v2/conftest.py index 33fbb3a53..99c052914 100755 --- a/tests_v2/conftest.py +++ b/tests_v2/conftest.py @@ -189,6 +189,7 @@ async def create_vector_source_version( monkeypatch.setattr(batch, "submit_batch_job", batch_job_mock.submit_batch_job) monkeypatch.setattr(vector_source_assets, "is_zipped", bool_function_closure(False)) monkeypatch.setattr(delete_assets, "delete_s3_objects", int_function_closure(1)) + monkeypatch.setattr(delete_assets, "expire_s3_objects", dict_function_closure({})) monkeypatch.setattr(versions, "flush_cloudfront_cache", dict_function_closure({})) monkeypatch.setattr( delete_assets, "flush_cloudfront_cache", dict_function_closure({}) @@ -248,6 +249,7 @@ async def generic_raster_version( monkeypatch.setattr(versions, "_verify_source_file_access", void_coroutine) monkeypatch.setattr(batch, "submit_batch_job", batch_job_mock.submit_batch_job) monkeypatch.setattr(delete_assets, "delete_s3_objects", int_function_closure(1)) + monkeypatch.setattr(delete_assets, "expire_s3_objects", dict_function_closure({})) monkeypatch.setattr(raster_tile_set_assets, "get_extent", get_extent_mocked) monkeypatch.setattr( delete_assets, "flush_cloudfront_cache", dict_function_closure({}) From c6b969d595d3eefc21d2e2382de4710b52d75b5c Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Wed, 24 Apr 2024 09:21:06 +0300 Subject: [PATCH 38/44] update fargate module tag --- terraform/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/main.tf b/terraform/main.tf index 2ceb7950b..0a0bc164a 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -76,7 +76,7 @@ module "batch_tile_cache_image" { module "fargate_autoscaling" { - source = "git::https://github.com/wri/gfw-terraform-modules.git//terraform/modules/fargate_autoscaling?ref=single-dev-lb" + source = "git::https://github.com/wri/gfw-terraform-modules.git//terraform/modules/fargate_autoscaling?ref=v0.4.2.5" project = local.project name_suffix = local.name_suffix tags = local.fargate_tags From c152b4e76dc9c5f0cf54d92fdf6839938efde9e5 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Thu, 25 Apr 2024 15:28:41 -0400 Subject: [PATCH 39/44] POSSIBLY enough to give the data API permission to expire tiles --- terraform/data.tf | 8 ++++++++ terraform/iam.tf | 5 +++++ terraform/main.tf | 1 + .../tile_cache_bucket_policy.json.tmpl | 20 +++++++++++++++++++ 4 files changed, 34 insertions(+) create mode 100644 terraform/templates/tile_cache_bucket_policy.json.tmpl diff --git a/terraform/data.tf b/terraform/data.tf index 6016f0118..5699f9013 100644 --- a/terraform/data.tf +++ b/terraform/data.tf @@ -175,3 +175,11 @@ data "external" "generate_port" { count = var.environment == "dev" ? 
1 : 0 program = ["python3", "${path.module}/generate_port.py", local.name_suffix, "30000", "31000"] } + +data "template_file" "tile_cache_bucket_policy" { + template = file("${path.root}/templates/tile_cache_bucket_policy.json.tmpl") + + vars = { + bucket_name = data.terraform_remote_state.tile_cache.outputs.tile_cache_bucket_name + } +} \ No newline at end of file diff --git a/terraform/iam.tf b/terraform/iam.tf index 6fc0012fc..6d7691880 100644 --- a/terraform/iam.tf +++ b/terraform/iam.tf @@ -32,4 +32,9 @@ resource "aws_iam_policy" "read_gcs_secret" { resource "aws_iam_policy" "read_new_relic_secret" { name = substr("${local.project}-read_new-relic_secret${local.name_suffix}", 0, 64) policy = data.aws_iam_policy_document.read_new_relic_lic.json +} + +resource "aws_iam_policy" "tile_cache_bucket_policy" { + name = substr("${local.project}-tile_cache_bucket_policy${local.name_suffix}", 0, 64) + policy = data.template_file.tile_cache_bucket_policy.rendered } \ No newline at end of file diff --git a/terraform/main.tf b/terraform/main.tf index 0a0bc164a..cd6c8d736 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -106,6 +106,7 @@ module "fargate_autoscaling" { aws_iam_policy.read_gcs_secret.arn, data.terraform_remote_state.tile_cache.outputs.ecs_update_service_policy_arn, data.terraform_remote_state.tile_cache.outputs.tile_cache_bucket_full_access_policy_arn, + aws_iam_policy.tile_cache_bucket_policy.arn, data.terraform_remote_state.tile_cache.outputs.cloudfront_invalidation_policy_arn ] task_execution_role_policies = [ diff --git a/terraform/templates/tile_cache_bucket_policy.json.tmpl b/terraform/templates/tile_cache_bucket_policy.json.tmpl new file mode 100644 index 000000000..0a3b3376c --- /dev/null +++ b/terraform/templates/tile_cache_bucket_policy.json.tmpl @@ -0,0 +1,20 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:ListBucket", + "s3:PutLifecycleConfiguration" + ], + "Resource": "${bucket_name}" + }, + { + "Effect": "Allow", + "Action": "s3:*", + "Resource": [ + "${bucket_name}/*" + ] + } + ] +} \ No newline at end of file From 7fbf69b85a9803797939620dc2dd2311303bc8f0 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Fri, 26 Apr 2024 00:19:33 -0400 Subject: [PATCH 40/44] Use (newly exported) tile cache bucket ARN --- terraform/data.tf | 2 +- terraform/templates/tile_cache_bucket_policy.json.tmpl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/terraform/data.tf b/terraform/data.tf index 5699f9013..56b445327 100644 --- a/terraform/data.tf +++ b/terraform/data.tf @@ -180,6 +180,6 @@ data "template_file" "tile_cache_bucket_policy" { template = file("${path.root}/templates/tile_cache_bucket_policy.json.tmpl") vars = { - bucket_name = data.terraform_remote_state.tile_cache.outputs.tile_cache_bucket_name + bucket_arn = data.terraform_remote_state.tile_cache.outputs.tile_cache_bucket_arn } } \ No newline at end of file diff --git a/terraform/templates/tile_cache_bucket_policy.json.tmpl b/terraform/templates/tile_cache_bucket_policy.json.tmpl index 0a3b3376c..99c589848 100644 --- a/terraform/templates/tile_cache_bucket_policy.json.tmpl +++ b/terraform/templates/tile_cache_bucket_policy.json.tmpl @@ -7,13 +7,13 @@ "s3:ListBucket", "s3:PutLifecycleConfiguration" ], - "Resource": "${bucket_name}" + "Resource": "${bucket_arn}" }, { "Effect": "Allow", "Action": "s3:*", "Resource": [ - "${bucket_name}/*" + "${bucket_arn}/*" ] } ] From 442d8b6b50c7534c8b21a416748ab1534746f394 Mon Sep 17 00:00:00 2001 From: Daniel 
Mannarino Date: Fri, 26 Apr 2024 10:21:35 -0400 Subject: [PATCH 41/44] Remove now-redundant tile cache policy --- terraform/main.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/terraform/main.tf b/terraform/main.tf index cd6c8d736..92b41c23f 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -105,7 +105,6 @@ module "fargate_autoscaling" { aws_iam_policy.iam_api_gateway_policy.arn, aws_iam_policy.read_gcs_secret.arn, data.terraform_remote_state.tile_cache.outputs.ecs_update_service_policy_arn, - data.terraform_remote_state.tile_cache.outputs.tile_cache_bucket_full_access_policy_arn, aws_iam_policy.tile_cache_bucket_policy.arn, data.terraform_remote_state.tile_cache.outputs.cloudfront_invalidation_policy_arn ] From 4b833517d330e616bc7647b33f3642d4b984deda Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Mon, 29 Apr 2024 09:20:52 -0400 Subject: [PATCH 42/44] Fix permission name per Dan's comment --- terraform/templates/tile_cache_bucket_policy.json.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/templates/tile_cache_bucket_policy.json.tmpl b/terraform/templates/tile_cache_bucket_policy.json.tmpl index 99c589848..20c1ad496 100644 --- a/terraform/templates/tile_cache_bucket_policy.json.tmpl +++ b/terraform/templates/tile_cache_bucket_policy.json.tmpl @@ -5,7 +5,7 @@ "Effect": "Allow", "Action": [ "s3:ListBucket", - "s3:PutLifecycleConfiguration" + "s3:PutBucketLifecycleConfiguration" ], "Resource": "${bucket_arn}" }, From e6433eb5a1a34e7d61717ed26925c8b1a1dcfa19 Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Wed, 1 May 2024 15:08:49 -0700 Subject: [PATCH 43/44] GTC-2027 (part 2) - give useful error when input geometry is not Polygon/MultiPolygon This is to fix some more "tuple index out of range" 500 errors. My previous fix was for usual queries. This fix is for zonal statistics calls. (which have been deprecated for quite a while - should we just disable zonal statistics queries sometime soon?) --- app/routes/analysis/analysis.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/app/routes/analysis/analysis.py b/app/routes/analysis/analysis.py index 989f72fc1..c49c17c1f 100644 --- a/app/routes/analysis/analysis.py +++ b/app/routes/analysis/analysis.py @@ -101,6 +101,12 @@ async def _zonal_statistics( start_date: Optional[str], end_date: Optional[str], ): + if geometry.type != "Polygon" and geometry.type != "MultiPolygon": + raise HTTPException( + status_code=400, + detail=f"Geometry must be a Polygon or MultiPolygon for raster analysis" + ) + # OTF will just not apply a base filter base = "data" From 8acd9b594653fa8fd87dc1319d4ee088ed0cf501 Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Tue, 7 May 2024 12:12:12 -0700 Subject: [PATCH 44/44] Disable expire_s3_objects call in delete_all_assets (used by delete_version) We still don't have permissions correct for doing the needed PutBuckedLifecycleConfiguration operation done by expire. And the failed background job is causing the main delete request to fail with a 504. (Same for deleting tile caches directly.) 
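
For context: expiring a prefix in S3 is done by attaching a lifecycle rule to the
bucket, which is why the task role needs lifecycle-configuration write access (the
underlying S3 API operation is PutBucketLifecycleConfiguration). A minimal
illustrative sketch with boto3 follows; it is not the repo's actual
expire_s3_objects implementation, and the bucket/prefix names are placeholders:

    import boto3

    def expire_prefix(bucket: str, prefix: str, days: int = 1) -> dict:
        # Illustrative only: a real implementation should first read any existing
        # rules and merge them in, because this call replaces the bucket's entire
        # lifecycle configuration.
        s3 = boto3.client("s3")
        return s3.put_bucket_lifecycle_configuration(
            Bucket=bucket,
            LifecycleConfiguration={
                "Rules": [
                    {
                        "ID": f"expire-{prefix}",
                        "Filter": {"Prefix": prefix},
                        "Status": "Enabled",
                        "Expiration": {"Days": days},
                    }
                ]
            },
        )

    # e.g. expire_prefix("gfw-tiles", "some_dataset/v1.0/", days=1)  # hypothetical names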
--- app/tasks/delete_assets.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/app/tasks/delete_assets.py b/app/tasks/delete_assets.py index 2528d1a0c..f04c9a2b4 100644 --- a/app/tasks/delete_assets.py +++ b/app/tasks/delete_assets.py @@ -11,7 +11,11 @@ async def delete_all_assets(dataset: str, version: str) -> None: await delete_database_table_asset(dataset, version) delete_s3_objects(DATA_LAKE_BUCKET, f"{dataset}/{version}/") - expire_s3_objects(TILE_CACHE_BUCKET, f"{dataset}/{version}/") + + # We don't yet have the correct PutBucketLifecycleConfiguration permission for + # this expire_s3_objects call. The failure of this background task somehow causes + # the main delete request to return a network error (504) + #expire_s3_objects(TILE_CACHE_BUCKET, f"{dataset}/{version}/") flush_cloudfront_cache(TILE_CACHE_CLOUDFRONT_ID, [f"/{dataset}/{version}/*"])
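
While debugging the permission issue described above, the bucket's current
lifecycle rules can be read back to confirm whether an expire call ever took
effect. Again an illustrative snippet only; the bucket name comes from the
PATCH 37 commit message and may differ per environment:

    import boto3
    from botocore.exceptions import ClientError

    s3 = boto3.client("s3")
    try:
        rules = s3.get_bucket_lifecycle_configuration(Bucket="gfw-tiles")["Rules"]
    except ClientError as exc:
        # S3 reports NoSuchLifecycleConfiguration when the bucket has no rules at all.
        print(f"No lifecycle configuration: {exc}")
        rules = []

    for rule in rules:
        print(rule.get("ID"), rule.get("Status"), rule.get("Expiration"))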