diff --git a/.github/workflows/terraform_destroy_on_delete.yaml b/.github/workflows/terraform_destroy_on_delete.yaml
index 171d2b337..8f948cb0a 100644
--- a/.github/workflows/terraform_destroy_on_delete.yaml
+++ b/.github/workflows/terraform_destroy_on_delete.yaml
@@ -4,7 +4,7 @@ on: [delete]
 
 jobs:
   build:
-    if: contains(github.event.ref_type, 'branch') && (! contains(github.event.ref, 'master')) && (! contains(github.event.ref, 'develop'))
+    if: contains(github.event.ref_type, 'branch') && (github.event.ref != 'master') && (github.event.ref != 'develop')
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v1
diff --git a/.github/workflows/terraform_plan.yaml b/.github/workflows/terraform_plan.yaml
index abba75df6..5102d9339 100644
--- a/.github/workflows/terraform_plan.yaml
+++ b/.github/workflows/terraform_plan.yaml
@@ -9,15 +9,16 @@ jobs:
     steps:
       - uses: actions/checkout@v1
       - name: Plan production
-        if: success() && contains(github.base_ref, 'master')
+        if: success() && github.base_ref == 'master'
         env:
           ENV: production
           AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_production }}
           AWS_SECRET_ACCESS_KEY: ${{ secrets.aws_secret_production }}
           AWS_REGION: ${{ secrets.aws_region_production }}
         run: ./scripts/infra plan -w ${{ github.base_ref }}
+
       - name: Plan staging
-        if: success() && contains(github.base_ref, 'develop')
+        if: success() && github.base_ref == 'develop'
         env:
           ENV: staging
           AWS_ACCESS_KEY_ID: ${{ secrets.aws_key_staging }}
diff --git a/.isort.cfg b/.isort.cfg
index 689f896bf..9c838702b 100644
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -2,4 +2,4 @@
 line_length = 88
 multi_line_output = 3
 include_trailing_comma = True
-known_third_party = _pytest,aenum,affine,aiohttp,alembic,async_lru,asyncpg,aws_utils,boto3,botocore,click,docker,errors,fastapi,fiona,gdal_utils,geoalchemy2,geojson,gfw_pixetl,gino,gino_starlette,google,httpx,httpx_auth,logger,logging_utils,moto,numpy,orjson,osgeo,pandas,pendulum,pglast,psutil,psycopg2,pydantic,pyproj,pytest,pytest_asyncio,rasterio,retrying,shapely,sqlalchemy,sqlalchemy_utils,starlette,tileputty,typer
+known_third_party = _pytest,aenum,affine,aiohttp,alembic,async_lru,asyncpg,aws_utils,boto3,botocore,click,docker,errors,fastapi,fiona,gdal_utils,geoalchemy2,geojson,gfw_pixetl,gino,gino_starlette,google,httpx,httpx_auth,logger,logging_utils,moto,numpy,orjson,osgeo,pandas,pendulum,pglast,psutil,psycopg2,pydantic,pyproj,pytest,pytest_asyncio,rasterio,shapely,sqlalchemy,sqlalchemy_utils,starlette,tileputty,typer
diff --git a/app/crud/metadata.py b/app/crud/metadata.py
index d05976744..8765d0746 100644
--- a/app/crud/metadata.py
+++ b/app/crud/metadata.py
@@ -181,12 +181,44 @@ async def get_asset_metadata(asset_id: UUID):
 async def update_asset_metadata(asset_id: UUID, **data) -> ORMAssetMetadata:
     """Update asset metadata."""
     fields = data.pop("fields", None)
+    bands = data.pop("bands", None)
 
     asset_metadata: ORMAssetMetadata = await get_asset_metadata(asset_id)
 
     if data:
         await asset_metadata.update(**data).apply()
 
+    bands_metadata = []
+    if bands:
+        for band in bands:
+            try:
+                pixel_meaning = band.pop("pixel_meaning")
+                band_metadata = await update_band_metadata(
+                    asset_metadata.id, pixel_meaning, **band
+                )
+            except RecordNotFoundError:
+                band_metadata = await create_raster_band_metadata(
+                    asset_metadata.id, pixel_meaning=pixel_meaning, **band
+                )
+            bands_metadata.append(band_metadata)
+
+    asset_metadata.bands = bands_metadata
+
+    fields_metadata = []
+    if fields:
+        for field in fields:
+            try:
+                field_metadata = await update_field_metadata(
+                    asset_metadata.id, field["name"], **field
+                )
+            except RecordNotFoundError:
+                field_metadata = await create_field_metadata(asset_metadata.id, **field)
+            fields_metadata.append(field_metadata)
+
+    asset_metadata.fields = fields_metadata
+
+    return asset_metadata
+
     fields_metadata = []
     if fields:
         for field in fields:
@@ -240,6 +272,18 @@ async def update_field_metadata(
     return field_metadata
 
 
+async def update_band_metadata(
+    metadata_id: UUID, pixel_meaning: str, **data
+) -> ORMRasterBandMetadata:
+    band_metadata: ORMRasterBandMetadata = await get_asset_raster_band(
+        metadata_id, pixel_meaning
+    )
+
+    await band_metadata.update(**data).apply()
+
+    return band_metadata
+
+
 async def get_asset_fields(asset_metadata_id: UUID) -> List[ORMFieldMetadata]:
     fields_metadata: List[ORMFieldMetadata] = await (
         ORMFieldMetadata.query.where(
@@ -272,6 +316,21 @@ async def get_asset_field(asset_metadata_id: UUID, field_name: str) -> ORMFieldM
     return field_metadata
 
 
+async def get_asset_raster_band(
+    asset_metadata_id: UUID, pixel_meaning: str
+) -> ORMRasterBandMetadata:
+    band_metadata: ORMRasterBandMetadata = await ORMRasterBandMetadata.get(
+        [asset_metadata_id, pixel_meaning]
+    )
+
+    if band_metadata is None:
+        raise RecordNotFoundError(
+            f"No band metadata record found for pixel meaning {pixel_meaning}."
+        )
+
+    return band_metadata
+
+
 def update_metadata(row: Base, parent: Base):
     """Dynamically update metadata with parent metadata.
 
diff --git a/app/models/orm/asset_metadata.py b/app/models/orm/asset_metadata.py
index 7b031ba40..4b51c77b9 100644
--- a/app/models/orm/asset_metadata.py
+++ b/app/models/orm/asset_metadata.py
@@ -52,8 +52,8 @@ class RasterBandMetadata(db.Model):
             name="asset_metadata_id_fk",
             onupdate="CASCADE",
             ondelete="CASCADE",
-            primary_key=True,
         ),
+        primary_key=True,
     )
     pixel_meaning = db.Column(db.String, primary_key=True)
     description = db.Column(db.String)
diff --git a/app/models/pydantic/versions.py b/app/models/pydantic/versions.py
index 53c665c86..908256527 100644
--- a/app/models/pydantic/versions.py
+++ b/app/models/pydantic/versions.py
@@ -17,7 +17,8 @@ class Version(BaseRecord):
     metadata: Union[VersionMetadataOut, BaseModel]
     status: VersionStatus = VersionStatus.pending
 
-    assets: List[Tuple[str, str]] = list()
+    # Each element of assets is a tuple (asset_type, asset_uri, asset_id)
+    assets: List[Tuple[str, str, str]] = list()
 
 
 class VersionCreateIn(StrictBaseModel):
diff --git a/app/routes/assets/asset.py b/app/routes/assets/asset.py
index e408d639f..0d39879be 100644
--- a/app/routes/assets/asset.py
+++ b/app/routes/assets/asset.py
@@ -11,11 +11,6 @@
 from typing import List, Optional, Union
 from uuid import UUID
 
-# from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Path, status
-from fastapi.responses import ORJSONResponse
-from starlette.responses import JSONResponse
-
-from app.models.pydantic.responses import Response
 from fastapi import (
     APIRouter,
     BackgroundTasks,
@@ -27,13 +22,18 @@
     status,
 )
 
+# from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Path, status
+from fastapi.responses import ORJSONResponse
+from starlette.responses import JSONResponse
+
+from app.models.pydantic.responses import Response
 from app.settings.globals import API_URL
 
 from ...authentication.token import is_admin
 from ...crud import assets
 from ...crud import metadata as metadata_crud
 from ...crud import tasks
-from ...errors import RecordAlreadyExistsError, RecordNotFoundError
+from ...errors import BadRequestError, RecordAlreadyExistsError, RecordNotFoundError
 from ...models.enum.assets import is_database_asset, is_single_file_asset
 from ...models.orm.asset_metadata import FieldMetadata as ORMFieldMetadata
 from ...models.orm.assets import Asset as ORMAsset
@@ -112,6 +112,8 @@ async def update_asset(
         row: ORMAsset = await assets.update_asset(asset_id, **input_data)
     except RecordNotFoundError as e:
         raise HTTPException(status_code=404, detail=str(e))
+    except BadRequestError as e:
+        raise HTTPException(status_code=400, detail=str(e))
     except NotImplementedError as e:
         raise HTTPException(status_code=501, detail=str(e))
 
diff --git a/app/routes/datasets/queries.py b/app/routes/datasets/queries.py
index 5a43fb307..4e39c107c 100755
--- a/app/routes/datasets/queries.py
+++ b/app/routes/datasets/queries.py
@@ -7,13 +7,13 @@
 from uuid import UUID, uuid4
 
 import httpx
+from async_lru import alru_cache
 from asyncpg import DataError, InsufficientPrivilegeError, SyntaxOrAccessError
 from fastapi import APIRouter, Depends, HTTPException, Query
 from fastapi import Request as FastApiRequest
 from fastapi import Response as FastApiResponse
 from fastapi.encoders import jsonable_encoder
 from fastapi.logger import logger
-
 # from fastapi.openapi.models import APIKey
 from fastapi.responses import RedirectResponse
 from pglast import printers  # noqa
@@ -21,11 +21,9 @@
 from pglast.parser import ParseError
 from pglast.printer import RawStream
 from pydantic.tools import parse_obj_as
-from sqlalchemy.sql import and_
 
 from ...authentication.token import is_gfwpro_admin_for_query
 from ...application import db
-
 # from ...authentication.api_keys import get_api_key
 from ...crud import assets
 from ...models.enum.assets import AssetType
@@ -62,7 +60,6 @@
 from ...models.enum.queries import QueryFormat, QueryType
 from ...models.orm.assets import Asset as AssetORM
 from ...models.orm.queries.raster_assets import latest_raster_tile_sets
-from ...models.orm.versions import Version as VersionORM
 from ...models.pydantic.asset_metadata import RasterTable, RasterTableRow
 from ...models.pydantic.creation_options import NoDataType
 from ...models.pydantic.geostore import Geometry, GeostoreCommon
@@ -659,13 +656,14 @@ async def _query_raster_lambda(
 
 
 def _get_area_density_name(nm):
-    """Return empty string if nm doesn't not have an area-density suffix, else
+    """Return empty string if nm doesn't have an area-density suffix, else
     return nm with the area-density suffix removed."""
     for suffix in AREA_DENSITY_RASTER_SUFFIXES:
         if nm.endswith(suffix):
             return nm[:-len(suffix)]
     return ""
 
+
 def _get_default_layer(dataset, pixel_meaning):
     default_type = pixel_meaning
     area_density_name = _get_area_density_name(default_type)
@@ -683,6 +681,7 @@ def _get_default_layer(dataset, pixel_meaning):
     return f"{dataset}__{default_type}"
 
 
+@alru_cache(maxsize=16, ttl=300.0)
 async def _get_data_environment(grid: Grid) -> DataEnvironment:
     # get all raster tile set assets with the same grid.
     latest_tile_sets = await db.all(latest_raster_tile_sets, {"grid": grid})
diff --git a/app/routes/datasets/versions.py b/app/routes/datasets/versions.py
index 9f54c7d25..50cdf0356 100644
--- a/app/routes/datasets/versions.py
+++ b/app/routes/datasets/versions.py
@@ -500,14 +500,14 @@ async def _version_response(
     associated assets."""
 
     assets: List[ORMAsset] = (
-        await ORMAsset.select("asset_type", "asset_uri")
+        await ORMAsset.select("asset_type", "asset_uri", "asset_id")
         .where(ORMAsset.dataset == dataset)
         .where(ORMAsset.version == version)
         .where(ORMAsset.status == AssetStatus.saved)
         .gino.all()
     )
 
     data = Version.from_orm(data).dict(by_alias=True)
-    data["assets"] = [(asset[0], asset[1]) for asset in assets]
+    data["assets"] = [(asset[0], asset[1], str(asset[2])) for asset in assets]
 
     return VersionResponse(data=Version(**data))
@@ -560,7 +560,7 @@ def _verify_source_file_access(sources: List[str]) -> None:
         raise HTTPException(
             status_code=400,
             detail=(
-                "Cannot access all of the source files. "
+                "Cannot access all of the source files (non-existent or access denied). "
                 f"Invalid sources: {invalid_sources}"
             ),
         )
diff --git a/app/tasks/delete_assets.py b/app/tasks/delete_assets.py
index 4582bae15..2528d1a0c 100644
--- a/app/tasks/delete_assets.py
+++ b/app/tasks/delete_assets.py
@@ -11,7 +11,7 @@ async def delete_all_assets(dataset: str, version: str) -> None:
     await delete_database_table_asset(dataset, version)
 
     delete_s3_objects(DATA_LAKE_BUCKET, f"{dataset}/{version}/")
-    # expire_s3_objects(TILE_CACHE_BUCKET, f"{dataset}/{version}/")
+    expire_s3_objects(TILE_CACHE_BUCKET, f"{dataset}/{version}/")
 
     flush_cloudfront_cache(TILE_CACHE_CLOUDFRONT_ID, [f"/{dataset}/{version}/*"])
 
diff --git a/app/utils/aws.py b/app/utils/aws.py
index fe981be42..2f9e2da66 100644
--- a/app/utils/aws.py
+++ b/app/utils/aws.py
@@ -1,8 +1,10 @@
 from typing import Any, Dict, List, Optional, Sequence
 
 import boto3
+import botocore
 import httpx
 from httpx_auth import AWS4Auth
+from fastapi.logger import logger
 
 from ..settings.globals import (
     AWS_REGION,
@@ -108,7 +110,11 @@ def get_aws_files(
                 if exit_after_max and num_matches >= exit_after_max:
                     break
 
-    except s3_client.exceptions.NoSuchBucket:
+    # Strangely, s3_client.exceptions has NoSuchBucket, but doesn't have
+    # AccessDenied, even though you can get that error, so we just catch all botocore
+    # exceptions.
+    except botocore.exceptions.ClientError as error:
+        logger.warning(f"get_aws_files: {error}")
         matches = list()
 
     return matches
diff --git a/batch/scripts/create_vector_tile_cache.sh b/batch/scripts/create_vector_tile_cache.sh
index ee96564ca..4fe2547e7 100755
--- a/batch/scripts/create_vector_tile_cache.sh
+++ b/batch/scripts/create_vector_tile_cache.sh
@@ -34,7 +34,7 @@ echo "Fetch NDJSON data from Data Lake ${SRC} -> ${DATASET}"
 aws s3 cp "${SRC}" "${DATASET}" --no-progress
 
 echo "Build Tile Cache"
-tippecanoe -Z"${MIN_ZOOM}" -z"${MAX_ZOOM}" -e tilecache "${STRATEGY[@]}" -P -n "${DATASET}" "${DATASET}"
+tippecanoe -Z"${MIN_ZOOM}" -z"${MAX_ZOOM}" -e tilecache "${STRATEGY[@]}" -P -n "${DATASET}" "${DATASET}" --preserve-input-order
 
 echo "Upload tiles to S3"
 tileputty tilecache --bucket "${TILE_CACHE}" --dataset "${DATASET}" --version "${VERSION}" --implementation "${IMPLEMENTATION}" --cores "${NUM_PROCESSES}"
\ No newline at end of file
diff --git a/newrelic.ini b/newrelic.ini
index e3f769ced..6e6083fd2 100644
--- a/newrelic.ini
+++ b/newrelic.ini
@@ -37,7 +37,7 @@ app_name = GFW Data API
 
 # New Relic offers distributed tracing for monitoring and analyzing modern
 # distributed systems.Enable distributed tracing.
-distributed_tracing.enabled =
+# distributed_tracing.enabled = false
 
 # When "true", the agent collects performance data about your
 # application and reports this data to the New Relic UI at
@@ -115,7 +115,7 @@ high_security = false
 # then add "request.parameters.*" to the "attributes.include"
 # setting. For details about attributes configuration, please
 # consult the documentation.
-# attributes.include = request.parameters.*
+attributes.include = request.parameters.*
 
 # The transaction tracer captures deep information about slow
 # transactions and sends this to the UI on a periodic basis. The
@@ -203,8 +203,10 @@ monitor_mode = false
 [newrelic:staging]
 app_name = GFW Data API (Staging)
 monitor_mode = true
+distributed_tracing.enabled = false
 
 [newrelic:production]
 monitor_mode = true
+distributed_tracing.enabled = true
 
 # ---------------------------------------------------------------------------
\ No newline at end of file
diff --git a/tests/crud/test_assets.py b/tests/crud/test_assets.py
index c36340ea1..a9a5c4ad8 100644
--- a/tests/crud/test_assets.py
+++ b/tests/crud/test_assets.py
@@ -10,18 +10,22 @@
     create_asset,
     delete_asset,
     get_asset,
-    get_assets,
     get_assets_by_filter,
     update_asset,
 )
 from app.crud.datasets import create_dataset
 from app.crud.versions import create_version
 from app.errors import RecordAlreadyExistsError, RecordNotFoundError
-from app.models.pydantic.change_log import ChangeLog
 from app.models.pydantic.asset_metadata import DatabaseTableMetadata
-from app.models.pydantic.metadata import VersionMetadata, DatasetMetadata
+from app.models.pydantic.change_log import ChangeLog
+from app.models.pydantic.metadata import DatasetMetadata, VersionMetadata
 
-from ..utils import dataset_metadata, version_metadata, asset_metadata
+from ..utils import (
+    asset_metadata,
+    dataset_metadata,
+    raster_asset_metadata,
+    version_metadata,
+)
 
 
 @pytest.mark.asyncio
@@ -243,3 +247,58 @@ async def test_assets_metadata():
     assert (
         asset.metadata.fields[0].data_type == asset_metadata["fields"][0]["data_type"]
     )
+
+
+@pytest.mark.asyncio
+async def test_band_metadata():
+    """Testing band metadata."""
+
+    dataset = "test"
+    version = "v1.1.1"
+
+    # Add a dataset
+    async with ContextEngine("WRITE"):
+        await create_dataset(
+            dataset, metadata=DatasetMetadata(**dataset_metadata).dict(by_alias=False)
+        )
+        await create_version(
+            dataset,
+            version,
+            metadata=VersionMetadata(**version_metadata).dict(by_alias=True),
+        )
+        new_asset = await create_asset(
+            dataset,
+            version,
+            asset_type="Raster tile set",
+            asset_uri="s3://path/to/file",
+            metadata=raster_asset_metadata,
+        )
+
+    asset_id = new_asset.asset_id
+    assert (
+        new_asset.metadata.bands[0].values_table
+        == raster_asset_metadata["bands"][0]["values_table"]
+    )
+
+    updated_band_metadata = {
+        "bands": [
+            {
+                "pixel_meaning": "year",
+                "values_table": {
+                    "rows": [
+                        {"value": 1, "meaning": 2001},
+                        {"value": 2, "meaning": 2002},
+                        {"value": 3, "meaning": 2003},
+                    ],
+                },
+            }
+        ]
+    }
+
+    async with ContextEngine("WRITE"):
+        asset = await update_asset(asset_id, metadata=updated_band_metadata)
+
+    assert (
+        asset.metadata.bands[0].values_table
+        == updated_band_metadata["bands"][0]["values_table"]
+    )
diff --git a/tests/routes/datasets/test_versions.py b/tests/routes/datasets/test_versions.py
old mode 100644
new mode 100755
index 492903d75..e1cc6a1aa
--- a/tests/routes/datasets/test_versions.py
+++ b/tests/routes/datasets/test_versions.py
@@ -310,7 +310,7 @@ async def test_invalid_source_uri(async_client: AsyncClient):
     assert response.json()["status"] == "failed"
     assert (
         response.json()["message"]
-        == f"Cannot access all of the source files. Invalid sources: ['{bad_uri}']"
+        == f"Cannot access all of the source files (non-existent or access denied). Invalid sources: ['{bad_uri}']"
     )
 
     # Create a version with a valid source_uri so we have something to append to
@@ -332,7 +332,7 @@
     assert response.json()["status"] == "failed"
     assert (
         response.json()["message"]
-        == f"Cannot access all of the source files. Invalid sources: ['{bad_uri}']"
+        == f"Cannot access all of the source files (non-existent or access denied). Invalid sources: ['{bad_uri}']"
     )
 
     # Test appending to a version that DOESN'T exist
diff --git a/tests/utils.py b/tests/utils.py
index 307578ea4..c3dba73e6 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -39,6 +39,20 @@
     "fields": [{"name": "field1", "data_type": "numeric", "unit": "meters"}]
 }
 
+raster_asset_metadata = {
+    "bands": [
+        {
+            "pixel_meaning": "year",
+            "values_table": {
+                "rows": [
+                    {"value": 1, "meaning": 2001},
+                    {"value": 2, "meaning": 2002},
+                ],
+            },
+        }
+    ]
+}
+
 generic_version_payload = {
     "metadata": version_metadata,
     "creation_options": {
diff --git a/tests_v2/conftest.py b/tests_v2/conftest.py
index 33fbb3a53..99c052914 100755
--- a/tests_v2/conftest.py
+++ b/tests_v2/conftest.py
@@ -189,6 +189,7 @@ async def create_vector_source_version(
     monkeypatch.setattr(batch, "submit_batch_job", batch_job_mock.submit_batch_job)
     monkeypatch.setattr(vector_source_assets, "is_zipped", bool_function_closure(False))
     monkeypatch.setattr(delete_assets, "delete_s3_objects", int_function_closure(1))
+    monkeypatch.setattr(delete_assets, "expire_s3_objects", dict_function_closure({}))
     monkeypatch.setattr(versions, "flush_cloudfront_cache", dict_function_closure({}))
     monkeypatch.setattr(
         delete_assets, "flush_cloudfront_cache", dict_function_closure({})
@@ -248,6 +249,7 @@ async def generic_raster_version(
     monkeypatch.setattr(versions, "_verify_source_file_access", void_coroutine)
     monkeypatch.setattr(batch, "submit_batch_job", batch_job_mock.submit_batch_job)
     monkeypatch.setattr(delete_assets, "delete_s3_objects", int_function_closure(1))
+    monkeypatch.setattr(delete_assets, "expire_s3_objects", dict_function_closure({}))
     monkeypatch.setattr(raster_tile_set_assets, "get_extent", get_extent_mocked)
     monkeypatch.setattr(
         delete_assets, "flush_cloudfront_cache", dict_function_closure({})
diff --git a/tests_v2/unit/app/routes/analysis/test_analysis.py b/tests_v2/unit/app/routes/analysis/test_analysis.py
index d8dcd6673..1def7fb39 100644
--- a/tests_v2/unit/app/routes/analysis/test_analysis.py
+++ b/tests_v2/unit/app/routes/analysis/test_analysis.py
@@ -81,6 +81,9 @@ async def test_analysis_with_huge_geostore(
 async def test_raster_analysis_payload_shape(
     generic_dataset, async_client: AsyncClient, monkeypatch: MonkeyPatch
 ):
+    """Note that this test depends on the output of _get_data_environment,
+    which will likely have cached values from other tests, so we clear it."""
+
     dataset_name, _ = generic_dataset
     pixel_meaning: str = "date_conf"
     no_data_value = 0
@@ -103,6 +106,9 @@ async def test_raster_analysis_payload_shape(
     )
     monkeypatch.setattr(geostore.rw_api, "get_geostore", mock_rw_get_geostore)
 
+    # The other tests will have polluted the data env cache. Clear it.
+    queries._get_data_environment.cache_clear()
+
     _ = await async_client.get(
         f"/analysis/zonal/17076d5ea9f214a5bdb68cc40433addb?geostore_origin=rw&group_by=umd_tree_cover_loss__year&filters=is__umd_regional_primary_forest_2001&filters=umd_tree_cover_density_2000__30&sum=area__ha&start_date=2001"
     )
diff --git a/tests_v2/unit/app/routes/datasets/test_version.py b/tests_v2/unit/app/routes/datasets/test_version.py
index 19f5b33bf..8d4b570c6 100644
--- a/tests_v2/unit/app/routes/datasets/test_version.py
+++ b/tests_v2/unit/app/routes/datasets/test_version.py
@@ -1,6 +1,7 @@
 import pytest
 from _pytest.monkeypatch import MonkeyPatch
 from httpx import AsyncClient
+import re
 
 from app.routes.datasets import versions
 from app.tasks import batch
@@ -17,6 +18,14 @@ async def test_get_version(async_client: AsyncClient, generic_vector_source_vers
     assert resp.status_code == 200
     data = resp.json()
     assert_jsend(data)
+    first_asset = data["data"]["assets"][0]
+    assert len(first_asset) == 3
+    assert first_asset[0] == "Geo database table"
+    # Check asset_id looks reasonable
+    asset_id = first_asset[2]
+    assert len(asset_id) > 30
+    pattern = re.compile(r'^[a-zA-Z0-9-]+$')
+    assert bool(pattern.match(asset_id))
 
 
 @pytest.mark.asyncio