Karlburke/search api reindex collection support #557

Merged Oct 23, 2023 (3 commits)

27 changes: 0 additions & 27 deletions entity-api-spec.yaml
@@ -1707,33 +1707,6 @@ paths:
description: The target entity could not be found
'500':
description: Internal error
'/collections/{id}':
get:
summary: 'Returns the information of the Collection specified by the uuid with all connected datasets. If a valid token is provided with group membership in the HuBMAP-Read group any collection matching the id will be returned. Otherwise if no token is provided or a valid token with no HuBMAP-Read group membership then only a public collection will be returned. Public collections are defined as being published via a DOI (collection.doi_registered == true) and at least one of the connected datasets is public (dataset.metadata.data_access_level == ''public''). For public collections only connected datasets that are public are returned with it.'
parameters:
- name: id
in: path
description: The unique identifier of the entity. This identifier can be either a HuBMAP ID (e.g. HBM123.ABCD.456) or UUID
required: true
schema:
type: string
responses:
'200':
description: The collection is returned
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/Collection'
'400':
description: Invalid or misformatted entity identifier
'401':
description: The user's token has expired or the user did not supply a valid token
'404':
description: The target entity could not be found
'500':
description: Internal error
'/entities/new/{entity_type}':
post:
summary: Create a new entity of the target type
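
For reference, the sketch below shows how a client could have called the removed GET /collections/{id} endpoint. It is a minimal example: the base URL is a placeholder for the real deployment, the collection ID uses the HuBMAP ID format from the spec, and the Globus groups token is optional (only needed to see non-public collections).

import requests

# Placeholder base URL for the entity-api deployment and an example collection ID
BASE_URL = "https://entity.api.example.org"
collection_id = "HBM123.ABCD.456"

# Optional Globus groups token; omit the header for anonymous (public-only) access
headers = {"Authorization": "Bearer <globus-groups-token>"}

resp = requests.get(f"{BASE_URL}/collections/{collection_id}", headers=headers)
if resp.status_code == 200:
    collection = resp.json()
    # For public collections, Collection.datasets contains only the public datasets
    print(collection.get("hubmap_id"), len(collection.get("datasets", [])))
else:
    # 400 bad identifier, 401 missing/expired token, 404 not found, 500 internal error
    print(f"Request failed with status {resp.status_code}")
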
201 changes: 2 additions & 199 deletions src/app.py
@@ -537,90 +537,6 @@ def get_entity_visibility(id):

return jsonify(entity_scope.value)

"""
Retrieve the collection detail by id

The gateway treats this endpoint as public accessible

An optional Globus groups token can be provided in a standard Authentication Bearer header. If a valid token
is provided with group membership in the HuBMAP-Read group any collection matching the id will be returned.
Otherwise, if no token is provided, or the token has no HuBMAP-Read group membership, then
only a public collection will be returned. Public collections are defined as being published via a DOI
(collection.registered_doi not null) and at least one of the connected datasets is public
(dataset.status == 'Published'). For public collections only connected datasets that are
public are returned with it.

By default we only return the following Dataset properties:

- collection.dataset.uuid
- collection.dataset.hubmap_id
- collection.dataset.data_types
- collection.dataset.status
- collection.dataset.last_modified_timestamp
- collection.dataset.created_by_user_displayname

Parameters
----------
id : str
The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target collection

Returns
-------
json
The collection detail with a list of connected datasets (only public datasets
if user doesn't have the right access permission)
"""
@app.route('/collections/<id>', methods = ['GET'])
def get_collection(id):
# Token is not required, but if an invalid token provided,
# we need to tell the client with a 401 error
validate_token_if_auth_header_exists(request)

# Use the internal token to query the target collection
# since public collections don't require user token
token = get_internal_token()

# Get the entity dict from cache if exists
# Otherwise query against uuid-api and neo4j to get the entity dict if the id exists
collection_dict = query_target_entity(id, token)

# A bit of validation
if collection_dict['entity_type'] != 'Collection':
bad_request_error("Target entity of the given id is not a collection")

# Try to get user token from Authorization header
# It's highly possible that there's no token provided
user_token = get_user_token(request)

# The user_token is flask.Response on error
# Without token, the user can only access public collections, modify the collection result
# by only returning public datasets attached to this collection
if isinstance(user_token, Response):
# When the requested collection is not public, send back 401
if ('registered_doi' not in collection_dict) or ('doi_url' not in collection_dict):
# Require a valid token in this case
unauthorized_error("The requested collection is not public, a Globus token with the right access permission is required.")

# Otherwise only return the public datasets attached to this collection
# for Collection.datasets property
complete_dict = get_complete_public_collection_dict(collection_dict)
else:
# When the groups token is valid, but the user doesn't belong to HuBMAP-READ group
# Or the token is valid but doesn't contain group information (auth token or transfer token)
# Only return the public datasets attached to this Collection
if not user_in_hubmap_read_group(request):
complete_dict = get_complete_public_collection_dict(collection_dict)
else:
# We'll need to return all the properties including those
# generated by `on_read_trigger` to have a complete result
complete_dict = schema_manager.get_complete_entity_result(user_token, collection_dict)

# Will also filter the result based on schema
normalized_complete_dict = schema_manager.normalize_entity_result_for_response(complete_dict)

# Response with the final result
return jsonify(normalized_complete_dict)

def _get_entity_visibility(normalized_entity_type, entity_dict):
if normalized_entity_type not in schema_manager.get_all_entity_types():
logger.log( logging.ERROR
@@ -893,8 +809,7 @@ def get_entity_types():
Parameters
----------
entity_type : str
One of the supported entity types: Dataset, Sample, Donor
Will handle Collection via API endpoint `/collections`
One of the supported entity types: Dataset, Collection, Sample, Donor

Returns
-------
@@ -965,80 +880,6 @@ def get_entities_by_type(entity_type):
# Response with the final result
return jsonify(final_result)

"""
Retrieve all the public collections

The gateway treats this endpoint as public accessible

Result filtering is supported based on query string
For example: /collections?property=uuid

Only return public collections, for either
- a valid token in HuBMAP-Read group,
- a valid token with no HuBMAP-Read group or
- no token at all

Public collections are defined as being published via a DOI
(collection.registered_doi is not null) and at least one of the connected datasets is published
(dataset.status == 'Published'). For public collections only connected datasets that are
published are returned with it.

Returns
-------
json
A list of all the public collection dictionaries (with attached public datasets)
"""
@app.route('/collections', methods = ['GET'])
def get_collections():
final_result = []

# Token is not required, but if an invalid token provided,
# we need to tell the client with a 401 error
validate_token_if_auth_header_exists(request)

normalized_entity_type = 'Collection'

# Result filtering based on query string
if bool(request.args):
property_key = request.args.get('property')

if property_key is not None:
result_filtering_accepted_property_keys = ['uuid']

# Validate the target property
if property_key not in result_filtering_accepted_property_keys:
bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(result_filtering_accepted_property_keys)}")

# Only return a list of the filtered property value of each public collection
final_result = app_neo4j_queries.get_public_collections(neo4j_driver_instance, property_key)
else:
bad_request_error("The specified query string is not supported. Use '?property=<key>' to filter the result")
# Return all the details if no property filtering
else:
# Use the internal token since no user token is required to access public collections
token = get_internal_token()

# Get back a list of public collections dicts
collections_list = app_neo4j_queries.get_public_collections(neo4j_driver_instance)

# Modify the Collection.datasets property for each collection dict
# to contain only public datasets
for collection_dict in collections_list:
# Only return the public datasets attached to this collection for Collection.datasets property
collection_dict = get_complete_public_collection_dict(collection_dict)

# Generate trigger data and merge into a big dict
# and skip some of the properties that are time-consuming to generate via triggers
properties_to_skip = ['datasets']
complete_collections_list = schema_manager.get_complete_entities_list(token, collections_list, properties_to_skip)

# Final result after normalization
final_result = schema_manager.normalize_entities_list_for_response(complete_collections_list)

# Response with the final result
return jsonify(final_result)


"""
Create an entity of the target type in neo4j

@@ -1049,7 +890,7 @@ def get_collections():
Parameters
----------
entity_type : str
One of the target entity types (case-insensitive since will be normalized): Dataset, Donor, Sample, Upload
One of the target entity types (case-insensitive since will be normalized): Dataset, Donor, Sample, Upload, Collection

Returns
-------
@@ -4017,44 +3858,6 @@ def validate_token_if_auth_header_exists(request):
def get_internal_token():
return auth_helper_instance.getProcessSecret()


"""
Return the complete collection dict for a given raw collection dict

Parameters
----------
collection_dict : dict
The raw collection dict returned by Neo4j

Returns
-------
dict
A dictionary of complete collection detail with all the generated 'on_read_trigger' data
The generated Collection.datasets contains only public datasets
if user/token doesn't have the right access permission
"""
def get_complete_public_collection_dict(collection_dict):
# Use internal token to query entity since
# no user token is required to access a public collection
token = get_internal_token()

# Collection.datasets is a transient property generated by the trigger method
# We'll need to return all the properties including those
# generated by `on_read_trigger` to have a complete result
complete_dict = schema_manager.get_complete_entity_result(token, collection_dict)

# Loop through Collection.datasets and only return the published/public datasets
public_datasets = []
for dataset in complete_dict['datasets']:
if dataset['status'].lower() == DATASET_STATUS_PUBLISHED:
public_datasets.append(dataset)

# Modify the result and only show the public datasets in this collection
complete_dict['datasets'] = public_datasets

return complete_dict


"""
Generate 'before_create_trigger' data and create the entity details in Neo4j

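
Similarly, a minimal sketch (using the same placeholder base URL) of how the removed GET /collections listing endpoint was consumed, including the ?property=uuid result filtering documented in the docstring above:

import requests

BASE_URL = "https://entity.api.example.org"  # placeholder deployment URL

# Full details of every public collection; no token is required
all_collections = requests.get(f"{BASE_URL}/collections").json()

# Only the uuid of each public collection, via the supported property filter
uuids = requests.get(f"{BASE_URL}/collections", params={"property": "uuid"}).json()

print(f"{len(all_collections)} public collections; first uuids: {uuids[:3]}")
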
47 changes: 0 additions & 47 deletions src/app_neo4j_queries.py
@@ -97,53 +97,6 @@ def get_entities_by_type(neo4j_driver, entity_type, property_key = None):

return results

"""
Get all the public collection nodes

Parameters
----------
neo4j_driver : neo4j.Driver object
The neo4j database connection pool
property_key : str
A target property key for result filtering

Returns
-------
list
A list of public collections returned from the Cypher query
"""
def get_public_collections(neo4j_driver, property_key = None):
results = []

if property_key:
query = (f"MATCH (e:Collection) "
f"WHERE e.registered_doi IS NOT NULL AND e.doi_url IS NOT NULL "
# COLLECT() returns a list
# apoc.coll.toSet() returns a set containing unique nodes
f"RETURN apoc.coll.toSet(COLLECT(e.{property_key})) AS {record_field_name}")
else:
query = (f"MATCH (e:Collection) "
f"WHERE e.registered_doi IS NOT NULL AND e.doi_url IS NOT NULL "
# COLLECT() returns a list
# apoc.coll.toSet() returns a set containing unique nodes
f"RETURN apoc.coll.toSet(COLLECT(e)) AS {record_field_name}")

logger.info("======get_public_collections() query======")
logger.info(query)

with neo4j_driver.session() as session:
record = session.read_transaction(schema_neo4j_queries.execute_readonly_tx, query)

if record and record[record_field_name]:
if property_key:
# Just return the list of property values from each entity node
results = record[record_field_name]
else:
# Convert the list of nodes to a list of dicts
results = schema_neo4j_queries.nodes_to_dicts(record[record_field_name])

return results

"""
Retrieve the ancestor organ(s) of a given entity

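
For completeness, a minimal sketch of how the removed get_public_collections() helper was invoked against the Neo4j connection pool. It assumes the function shown above is still in scope; the bolt URI and credentials are placeholders, not values from this repository.

from neo4j import GraphDatabase

# Placeholder connection details for a Neo4j instance
neo4j_driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

# Full node dicts for every public collection
collections = get_public_collections(neo4j_driver)

# Or just the distinct uuid values, using the property_key filter
uuids = get_public_collections(neo4j_driver, property_key="uuid")
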
2 changes: 1 addition & 1 deletion src/requirements.txt
@@ -21,7 +21,7 @@ PyYAML==5.4.1
# Use the branch name of commons from github for testing new changes made in commons from different branch
# Default is main branch specified in docker-compose.development.yml if not set
# git+https://github.com/hubmapconsortium/commons.git@${COMMONS_BRANCH}#egg=hubmap-commons
hubmap-commons==2.1.11
hubmap-commons==2.1.12

# For unit test
nose2==0.10.0