diff --git a/VERSION b/VERSION index 9f0db51d..63563755 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1.38 +2.1.39 diff --git a/entity-api-spec.yaml b/entity-api-spec.yaml index 489a5c69..fbfc5548 100644 --- a/entity-api-spec.yaml +++ b/entity-api-spec.yaml @@ -406,59 +406,6 @@ components: - section - suspension description: "A code representing the type of specimen. Must be an organ, block, section, or suspension" - specimen_type: - type: string - enum: - - atacseq - - biopsy - - blood - - cell_lysate - - clarity_hydrogel - - codex - - cryosections_curls_from_fresh_frozen_oct - - cryosections_curls_rnalater - - ffpe_block - - ffpe_slide - - fixed_frozen_section_slide - - fixed_tissue_piece - - flash_frozen_liquid_nitrogen - - formalin_fixed_oct_block - - fresh_frozen_oct_block - - fresh_frozen_section_slide - - fresh_frozen_tissue - - fresh_frozen_tissue_section - - fresh_tissue - - frozen_cell_pellet_buffy_coat - - gdna - - module - - nuclei - - nuclei_rnalater - - organ - - organ_piece - - other - - pbmc - - pfa_fixed_frozen_oct_block - - plasma - - protein - - ran_poly_a_enriched - - rna_total - - rnalater_treated_and_stored - - rnaseq - - scatacseq - - scrnaseq - - segment - - seqfish - - sequence_library - - serum - - single_cell_cryopreserved - - snatacseq - - snrnaseq - - tissue_lysate - - wgs - description: "DEPRECATED: No longer a required field. A code representing the type of specimen. Must be one of the codes specified in: [tissue sample types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml)" - specimen_type_other: - type: string - description: "The user provided sample type if the 'other' sample_type is chosen." protocol_url: type: string description: "The protocols.io doi url pointing the protocol under wich the sample was obtained and/or prepared." @@ -1874,6 +1821,49 @@ paths: description: The target dataset could not be found '500': description: Internal error + '/datasets/{id}/multi-revisions': + get: + summary: 'Retrieve a list of all multi revisions of a dataset from the id of any dataset in the chain. E.g: If there are 5 revisions, and the id for revision 4 is given, a list of revisions 1-5 will be returned in reverse order (newest first). Non-public access is only required to retrieve information on non-published datasets. Output will be a list of dictionaries. Each dictionary contains the dataset revision number and its list of uuids. Optionally, the full dataset can be included for each.' + parameters: + - name: id + in: path + description: The unique identifier of entity. This identifier can be either an HuBMAP ID (e.g. HBM123.ABCD.456) or UUID + required: true + schema: + type: string + - name: include_dataset + in: query + description: A case insensitive string. Any value besides true will have no effect. 
If the string is 'true', the full dataset for each revision will be included in the response + required: false + schema: + type: string + enum: ['true', 'false'] + responses: + '200': + description: The list of revised datasets that the referenced dataset is a member of including the index number of the revision, where 1 is the oldest version of any revision chain + content: + application/json: + schema: + type: object + properties: + uuid: + type: string + description: The uuid of a dataset + revision_number: + type: integer + description: The number in the revision chain of this dataset where 1 is the oldest revision + dataset: + $ref: '#/components/schemas/Dataset' + '400': + description: Invalid or misformatted entity identifier, or the given entity is not a Dataset + '401': + description: The user's token has expired or the user did not supply a valid token + '403': + description: The user is not authorized to query the revision number of the given dataset. + '404': + description: The target dataset could not be found + '500': + description: Internal error '/datasets/{id}/revisions': get: summary: 'From a given ID of a versioned dataset, retrieve a list of every dataset in the chain ordered from most recent to oldest. The revision number, as well as the dataset uuid will be included. An optional parameter ?include_dataset=true will include the full dataset for each revision as well. Public/Consortium access rules apply, if is for a non-public dataset and no token or a token without membership in HuBMAP-Read group is sent with the request then a 403 response should be returned. If the given id is published, but later revisions are not and the user is not in HuBMAP-Read group, only published revisions will be returned. The field next_revision_uuid will not be returned if the next revision is unpublished' @@ -1890,7 +1880,7 @@ paths: required: false schema: type: string - enum: ['true', 'false'] + enum: [ 'true', 'false' ] responses: '200': description: The list of revised datasets that the referenced dataset is a member of including the index number of the revision, where 1 is the oldest version of any revision chain diff --git a/src/app.py b/src/app.py index 7878d726..00f1d523 100644 --- a/src/app.py +++ b/src/app.py @@ -44,9 +44,6 @@ global logger # Set logging format and level (default is warning) -# All the API logging is forwarded to the uWSGI server and gets written into the log file `log/uwsgi-entity-api.log` -# Log rotation is handled via logrotate on the host system with a configuration file -# Do NOT handle log file and rotation via the Python logging to avoid issues with multi-worker processes logging.basicConfig(format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s', level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S') # Use `getLogger()` instead of `getLogger(__name__)` to apply the config to the root logger @@ -60,6 +57,7 @@ # Remove trailing slash / from URL base to avoid "//" caused by config with trailing slash app.config['UUID_API_URL'] = app.config['UUID_API_URL'].strip('/') app.config['INGEST_API_URL'] = app.config['INGEST_API_URL'].strip('/') +app.config['ONTOLOGY_API_URL'] = app.config['ONTOLOGY_API_URL'].strip('/') app.config['SEARCH_API_URL_LIST'] = [url.strip('/') for url in app.config['SEARCH_API_URL_LIST']] # This mode when set True disables the PUT and POST calls, used on STAGE to make entity-api READ-ONLY @@ -198,6 +196,7 @@ def http_internal_server_error(e): schema_manager.initialize(app.config['SCHEMA_YAML_FILE'], app.config['UUID_API_URL'], 
app.config['INGEST_API_URL'], + app.config['ONTOLOGY_API_URL'], auth_helper_instance, neo4j_driver_instance, memcached_client_instance, @@ -437,7 +436,6 @@ def get_ancestor_organs(id): bad_request_error(f"Unable to get the ancestor organs for this: {normalized_entity_type}," " supported entity types: Sample, Dataset, Publication") - # specimen_type -> sample_category 12/15/2022 if normalized_entity_type == 'Sample' and entity_dict['sample_category'].lower() == 'organ': bad_request_error("Unable to get the ancestor organ of an organ.") @@ -940,7 +938,6 @@ def create_entity(entity_type): # Check existence of the direct ancestor (either another Sample or Donor) direct_ancestor_dict = query_target_entity(direct_ancestor_uuid, user_token) - # specimen_type -> sample_category 12/15/2022 # `sample_category` is required on create sample_category = json_data_dict['sample_category'].lower() @@ -983,26 +980,31 @@ def create_entity(entity_type): # Also check existence of the previous revision dataset if specified if 'previous_revision_uuid' in json_data_dict: - previous_version_dict = query_target_entity(json_data_dict['previous_revision_uuid'], user_token) + if isinstance(json_data_dict['previous_revision_uuid'], list): + previous_revision_list = json_data_dict['previous_revision_uuid'] - # Make sure the previous version entity is either a Dataset or Sample (and publication 2/17/23) - if previous_version_dict['entity_type'] not in ['Sample'] and \ - not schema_manager.entity_type_instanceof(previous_version_dict['entity_type'], 'Dataset'): - bad_request_error(f"The previous_revision_uuid specified for this dataset must be either a Dataset or Sample or Publication") + nested_revisions = app_neo4j_queries.nested_previous_revisions(neo4j_driver_instance, previous_revision_list) + if nested_revisions: + bad_request_error(f"{nested_revisions[0][0]} is a revision of {nested_revisions[1][0]}. Datasets in previous_revision_uuid must not be revisions of eachother") + else: + previous_revision_list = [json_data_dict['previous_revision_uuid']] + for previous_revision in previous_revision_list: + previous_version_dict = query_target_entity(previous_revision, user_token) + + # Make sure the previous version entity is either a Dataset or Sample (and publication 2/17/23) + if not schema_manager.entity_type_instanceof(previous_version_dict['entity_type'], 'Dataset'): + bad_request_error(f"The previous_revision_uuid specified for this dataset must be either a Dataset or Sample or Publication") - # Also need to validate if the given 'previous_revision_uuid' has already had - # an existing next revision - # Only return a list of the uuids, no need to get back the list of dicts - next_revisions_list = app_neo4j_queries.get_next_revisions(neo4j_driver_instance, previous_version_dict['uuid'], 'uuid') + next_revision_is_latest = app_neo4j_queries.is_next_revision_latest(neo4j_driver_instance, previous_version_dict['uuid']) - # As long as the list is not empty, tell the users to use a different 'previous_revision_uuid' - if next_revisions_list: - bad_request_error(f"The previous_revision_uuid specified for this dataset has already had a next revision") + # As long as the list is not empty, tell the users to use a different 'previous_revision_uuid' + if not next_revision_is_latest: + bad_request_error(f"The previous_revision_uuid specified for this dataset has already had a next revision") - # Only published datasets can have revisions made of them. 
Verify that that status of the Dataset specified - # by previous_revision_uuid is published. Else, bad request error. - if previous_version_dict['status'].lower() != DATASET_STATUS_PUBLISHED: - bad_request_error(f"The previous_revision_uuid specified for this dataset must be 'Published' in order to create a new revision from it") + # Only published datasets can have revisions made of them. Verify that that status of the Dataset specified + # by previous_revision_uuid is published. Else, bad request error. + if previous_version_dict['status'].lower() != DATASET_STATUS_PUBLISHED: + bad_request_error(f"The previous_revision_uuid specified for this dataset must be 'Published' in order to create a new revision from it") # If the preceding "additional validations" did not raise an error, # generate 'before_create_trigger' data and create the entity details in Neo4j @@ -1108,7 +1110,6 @@ def create_multiple_samples(count): # sample's direct ancestor is a Donor. # Must be one of the codes from: https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/organ_types.yaml if direct_ancestor_dict['entity_type'] == 'Donor': - # specimen_type -> sample_category 12/15/2022 # `sample_category` is required on create if json_data_dict['sample_category'].lower() != 'organ': bad_request_error("The sample_category must be organ since the direct ancestor is a Donor") @@ -2279,6 +2280,100 @@ def get_dataset_revision_number(id): return jsonify(revision_number) +""" +Retrieve a list of all multi revisions of a dataset from the id of any dataset in the chain. +E.g: If there are 5 revisions, and the id for revision 4 is given, a list of revisions +1-5 will be returned in reverse order (newest first). Non-public access is only required to +retrieve information on non-published datasets. Output will be a list of dictionaries. Each dictionary +contains the dataset revision number and its list of uuids. Optionally, the full dataset can be included for each. + +By default, only the revision number and uuids are included. To include the full dataset, the query +parameter "include_dataset" can be given with the value of "true". If this parameter is not included or +is set to false, the dataset will not be included. For example, to include the full datasets for each revision, +use '/datasets//multi-revisions?include_dataset=true'. To omit the datasets, either set include_dataset=false, or +simply do not include this parameter. + +Parameters +---------- +id : str + The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target dataset + +Returns +------- +list + The list of revision datasets +""" +@app.route('/entities//multi-revisions', methods=['GET']) +@app.route('/datasets//multi-revisions', methods=['GET']) +def get_multi_revisions_list(id): + # By default, do not return dataset. 
Only return dataset if include_dataset is true + property_key = 'uuid' + if bool(request.args): + include_dataset = request.args.get('include_dataset') + if (include_dataset is not None) and (include_dataset.lower() == 'true'): + property_key = None + # Token is not required, but if an invalid token provided, + # we need to tell the client with a 401 error + validate_token_if_auth_header_exists(request) + + # Use the internal token to query the target entity + # since public entities don't require user token + token = get_internal_token() + + # Query target entity against uuid-api and neo4j and return as a dict if exists + entity_dict = query_target_entity(id, token) + normalized_entity_type = entity_dict['entity_type'] + + # Only for Dataset + if not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'): + abort_bad_req("The entity is not a Dataset. Found entity type:" + normalized_entity_type) + + # Only published/public datasets don't require token + if entity_dict['status'].lower() != DATASET_STATUS_PUBLISHED: + # Token is required and the user must belong to HuBMAP-READ group + token = get_user_token(request, non_public_access_required=True) + + # By now, either the entity is public accessible or + # the user token has the correct access level + # Get the all the sorted (DESC based on creation timestamp) revisions + sorted_revisions_list = app_neo4j_queries.get_sorted_multi_revisions(neo4j_driver_instance, entity_dict['uuid'], + fetch_all=user_in_hubmap_read_group(request), + property_key=property_key) + + # Skip some of the properties that are time-consuming to generate via triggers + properties_to_skip = [ + 'direct_ancestors', + 'collections', + 'upload', + 'title' + ] + + normalized_revisions_list = [] + sorted_revisions_list_merged = sorted_revisions_list[0] + sorted_revisions_list[1][::-1] + + if property_key is None: + for revision in sorted_revisions_list_merged: + complete_revision_list = schema_manager.get_complete_entities_list(token, revision, properties_to_skip) + normal = schema_manager.normalize_entities_list_for_response(complete_revision_list) + normalized_revisions_list.append(normal) + else: + normalized_revisions_list = sorted_revisions_list_merged + + # Now all we need to do is to compose the result list + results = [] + revision_number = len(normalized_revisions_list) + for revision in normalized_revisions_list: + result = { + 'revision_number': revision_number, + 'uuids': revision + } + results.append(result) + revision_number -= 1 + + return jsonify(results) + + + """ Retract a published dataset with a retraction reason and sub status @@ -2623,26 +2718,12 @@ def get_prov_info(): # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description # because that would require using a urllib request for each dataset - response = schema_manager.make_request_get(SchemaConstants.ORGAN_TYPES_YAML) - - if response.status_code == 200: - yaml_file = response.text - try: - organ_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) + organ_types_dict = schema_manager.get_organ_types() # As above, we parse te assay type yaml here rather than calling the special method for it because this avoids # having to access the resource for every dataset. 
- response = schema_manager.make_request_get(SchemaConstants.ASSAY_TYPES_YAML) - - if response.status_code == 200: - yaml_file = response.text - try: - assay_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) - + assay_types_dict = schema_manager.get_assay_types() + # Processing and validating query parameters accepted_arguments = ['format', 'organ', 'has_rui_info', 'dataset_status', 'group_uuid'] return_json = False @@ -2744,8 +2825,6 @@ def get_prov_info(): first_sample_hubmap_id_list.append(item['hubmap_id']) first_sample_submission_id_list.append(item['submission_id']) first_sample_uuid_list.append(item['uuid']) - - # specimen_type -> sample_category 12/15/2022 first_sample_type_list.append(item['sample_category']) first_sample_portal_url_list.append(app.config['DOI_REDIRECT_URL'].replace('', 'sample').replace('', item['uuid'])) @@ -3007,25 +3086,11 @@ def get_prov_info_for_dataset(id): # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description # because that would require using a urllib request for each dataset - response = schema_manager.make_request_get(SchemaConstants.ORGAN_TYPES_YAML) - - if response.status_code == 200: - yaml_file = response.text - try: - organ_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) + organ_types_dict = schema_manager.get_organ_types() # As above, we parse te assay type yaml here rather than calling the special method for it because this avoids # having to access the resource for every dataset. - response = schema_manager.make_request_get(SchemaConstants.ASSAY_TYPES_YAML) - - if response.status_code == 200: - yaml_file = response.text - try: - assay_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) + assay_types_dict = schema_manager.get_assay_types() hubmap_ids = schema_manager.get_hubmap_ids(id) @@ -3078,8 +3143,6 @@ def get_prov_info_for_dataset(id): first_sample_hubmap_id_list.append(item['hubmap_id']) first_sample_submission_id_list.append(item['submission_id']) first_sample_uuid_list.append(item['uuid']) - - # specimen_type -> sample_category 12/15/2022 first_sample_type_list.append(item['sample_category']) first_sample_portal_url_list.append( @@ -3197,7 +3260,6 @@ def get_prov_info_for_dataset(id): else: requested_samples = {} for uuid in dataset_samples.keys(): - # specimen_type -> sample_category 12/15/2022 if dataset_samples[uuid]['sample_category'] in include_samples: requested_samples[uuid] = dataset_samples[uuid] internal_dict[HEADER_DATASET_SAMPLES] = requested_samples @@ -3251,25 +3313,11 @@ def sankey_data(): mapping_dict = json.load(f) # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description # because that would require using a urllib request for each dataset - response = schema_manager.make_request_get(SchemaConstants.ORGAN_TYPES_YAML) - - if response.status_code == 200: - yaml_file = response.text - try: - organ_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) + organ_types_dict = schema_manager.get_organ_types() # As above, we parse te assay type yaml here rather than calling the special method for it because this avoids # having to access the resource for every dataset. 
- response = schema_manager.make_request_get(SchemaConstants.ASSAY_TYPES_YAML) - - if response.status_code == 200: - yaml_file = response.text - try: - assay_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) + assay_types_dict = schema_manager.get_assay_types() # Instantiation of the list dataset_sankey_list dataset_sankey_list = [] @@ -3377,14 +3425,16 @@ def get_sample_prov_info(): # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description # because that would require using a urllib request for each dataset - response = schema_manager.make_request_get(SchemaConstants.ORGAN_TYPES_YAML) + # response = schema_manager.make_request_get(SchemaConstants.ORGAN_TYPES_YAML) - if response.status_code == 200: - yaml_file = response.text - try: - organ_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) + # if response.status_code == 200: + # yaml_file = response.text + # try: + # organ_types_dict = yaml.safe_load(yaml_file) + # except yaml.YAMLError as e: + # raise yaml.YAMLError(e) + + organ_types_dict = schema_manager.get_organ_types() # Processing and validating query parameters accepted_arguments = ['group_uuid'] @@ -3421,7 +3471,6 @@ def get_sample_prov_info(): organ_hubmap_id = sample['organ_hubmap_id'] organ_submission_id = sample['organ_submission_id'] else: - # sample_specimen_type -> sample_category 12/15/2022 if sample['sample_category'] == "organ": organ_uuid = sample['sample_uuid'] organ_type = organ_types_dict[sample['sample_organ']]['description'].lower() @@ -3449,10 +3498,7 @@ def get_sample_prov_info(): internal_dict[HEADER_SAMPLE_HAS_METADATA] = sample_has_metadata internal_dict[HEADER_SAMPLE_HAS_RUI_INFO] = sample_has_rui_info internal_dict[HEADER_SAMPLE_DIRECT_ANCESTOR_ID] = sample['sample_ancestor_id'] - - # sample_specimen_type -> sample_category 12/15/2022 internal_dict[HEADER_SAMPLE_TYPE] = sample['sample_category'] - internal_dict[HEADER_SAMPLE_HUBMAP_ID] = sample['sample_hubmap_id'] internal_dict[HEADER_SAMPLE_SUBMISSION_ID] = sample['sample_submission_id'] internal_dict[HEADER_SAMPLE_DIRECT_ANCESTOR_ENTITY_TYPE] = sample['sample_ancestor_entity'] @@ -4744,34 +4790,18 @@ def access_level_prefix_dir(dir_name): Returns nothing. Raises bad_request_error is organ code not found on organ_types.yaml """ def validate_organ_code(organ_code): - yaml_file_url = SchemaConstants.ORGAN_TYPES_YAML - - # Use Memcached to improve performance - response = schema_manager.make_request_get(yaml_file_url) - - if response.status_code == 200: - yaml_file = response.text + try: + organ_types_dict = schema_manager.get_organ_types() - try: - organ_types_dict = yaml.safe_load(response.text) - - if organ_code.upper() not in organ_types_dict: - bad_request_error(f"Invalid organ code. Must be 2 digit code specified {yaml_file_url}") - except yaml.YAMLError as e: - raise yaml.YAMLError(e) - else: - msg = f"Unable to fetch the: {yaml_file_url}" + if organ_code.upper() not in organ_types_dict: + bad_request_error(f"Invalid organ code. 
Must be 2 digit code") + except: + msg = f"Failed to validate the organ code: {organ_code}" # Log the full stack trace, prepend a line with our message logger.exception(msg) - logger.debug("======validate_organ_code() status code======") - logger.debug(response.status_code) - - logger.debug("======validate_organ_code() response text======") - logger.debug(response.text) - # Terminate and let the users know - internal_server_error(f"Failed to validate the organ code: {organ_code}") + internal_server_error(msg) #################################################################################################### diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py index 129f528c..586edf45 100644 --- a/src/app_neo4j_queries.py +++ b/src/app_neo4j_queries.py @@ -300,6 +300,66 @@ def get_sorted_revisions(neo4j_driver, uuid): return results +""" +Get all revisions for a given dataset uuid and sort them in descending order based on their creation time + +Parameters +---------- +neo4j_driver : neo4j.Driver object + The neo4j database connection pool +uuid : str + The uuid of target entity +fetch_all : bool + Whether to fetch all Datasets or only include Published +property_key : str + Return only a particular property from the cypher query, None for return all + +Returns +------- +dict + A multi-dimensional list [prev_revisions>, next_revisions>] +""" + + +def get_sorted_multi_revisions(neo4j_driver, uuid, fetch_all=True, property_key=False): + results = [] + match_case = '' if fetch_all is True else 'AND prev.status = "Published" AND next.status = "Published" ' + collect_prop = f".{property_key}" if property_key else '' + + query = ( + "MATCH (e:Dataset), (next:Dataset), (prev:Dataset)," + f"p = (e)-[:REVISION_OF *0..]->(prev)," + f"n = (e)<-[:REVISION_OF *0..]-(next) " + f"WHERE e.uuid='{uuid}' {match_case}" + "WITH length(p) AS p_len, prev, length(n) AS n_len, next " + "ORDER BY prev.created_timestamp, next.created_timestamp DESC " + f"WITH p_len, collect(distinct prev{collect_prop}) AS prev_revisions, n_len, collect(distinct next{collect_prop}) AS next_revisions " + f"RETURN [collect(distinct next_revisions), collect(distinct prev_revisions)] AS {record_field_name}" + ) + + logger.info("======get_sorted_revisions() query======") + logger.info(query) + + with neo4j_driver.session() as session: + record = session.read_transaction(schema_neo4j_queries.execute_readonly_tx, query) + + if record and record[record_field_name] and len(record[record_field_name]) > 0: + record[record_field_name][0].pop() # the target will appear twice, pop it from the next list + if property_key: + return record[record_field_name] + else: + for collection in record[record_field_name]: # two collections: next, prev + revs = [] + for rev in collection: # each collection list contains revision lists, so 2 dimensional array + # Convert the list of nodes to a list of dicts + nodes_to_dicts = schema_neo4j_queries.nodes_to_dicts(rev) + revs.append(nodes_to_dicts) + + results.append(revs) + + return results + + """ Get all previous revisions of the target entity by uuid @@ -399,6 +459,78 @@ def get_next_revisions(neo4j_driver, uuid, property_key = None): return results +""" +Verifies whether a revisions of a given entity are the last (most recent) revisions. Example: If an entity has a +revision, but that revision also has a revision, return false. 
+ +Parameters +---------- +neo4j_driver : neo4j.Driver object + The neo4j database connection pool +uuid : str + The uuid of target entity + +Returns +------- +bool + Returns true or false whether revisions of the target entity are the latest revisions +""" +def is_next_revision_latest(neo4j_driver, uuid): + results = [] + + query = (f"MATCH (e:Entity)<-[:REVISION_OF*]-(rev:Entity)<-[:REVISION_OF*]-(next:Entity) " + f"WHERE e.uuid='{uuid}' " + # COLLECT() returns a list + # apoc.coll.toSet() reruns a set containing unique nodes + f"RETURN apoc.coll.toSet(COLLECT(next.uuid)) AS {record_field_name}") + + logger.info("======is_next_revision_latest() query======") + logger.info(query) + + with neo4j_driver.session() as session: + record = session.read_transaction(schema_neo4j_queries.execute_readonly_tx, query) + + if record and record[record_field_name]: + results = record[record_field_name] + if results: + return False + else: + return True + + +""" +Verifies that, for a list of previous revision, one or more revisions in the list is itself a revision of another +revision in the list. + +Parameters +---------- +previous_revision_list : list + The list of previous_revision_uuids + +Returns +------- +tuple + The uuid of the first encountered uuid that is a revision of another previous_revision, as well as the uuid that it is a revision of + Else return None +""" +def nested_previous_revisions(neo4j_driver, previous_revision_list): + query = (f"WITH {previous_revision_list} AS uuidList " + "MATCH (ds1:Dataset)-[r:REVISION_OF]->(ds2:Dataset) " + "WHERE ds1.uuid IN uuidList AND ds2.uuid IN uuidList " + "WITH COLLECT(DISTINCT ds1.uuid) AS connectedUUID1, COLLECT(DISTINCT ds2.uuid) as connectedUUID2 " + "RETURN connectedUUID1, connectedUUID2 ") + + logger.info("======nested_previous_revisions() query======") + logger.info(query) + + with neo4j_driver.session() as session: + record = session.read_transaction(schema_neo4j_queries.execute_readonly_tx, query) + if record[0]: + return record + else: + return None + + """ Retrive the full tree above the given entity diff --git a/src/instance/app.cfg.example b/src/instance/app.cfg.example index 0c55f5bd..839972dc 100644 --- a/src/instance/app.cfg.example +++ b/src/instance/app.cfg.example @@ -28,6 +28,10 @@ UUID_API_URL = 'http://uuid-api:8080' # Works regardless of the trailing slash INGEST_API_URL = 'https://ingest-api.dev.hubmapconsortium.org' +# URL for talking to Ontology API (default for DEV) +# Works regardless of the trailing slash +ONTOLOGY_API_URL = 'https://ontology-api.dev.hubmapconsortium.org' + # A list of URLs for talking to multiple Search API instances (default value used for docker deployment, no token needed) # Works regardless of the trailing slash / SEARCH_API_URL_LIST = ['http://search-api:8080'] diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml index b72bf3a1..9b599776 100644 --- a/src/schema/provenance_schema.yaml +++ b/src/schema/provenance_schema.yaml @@ -418,7 +418,9 @@ ENTITIES: description: "The displayname of globus group which the user who created this entity is a member of" before_create_trigger: set_group_name #same as group_uuid, except set group_name previous_revision_uuid: - type: string + type: + - string + - list transient: true immutable: true description: "The uuid of previous revision dataset" @@ -645,7 +647,9 @@ ENTITIES: description: "The displayname of globus group which the user who created this entity is a member of" before_create_trigger: set_group_name #same as group_uuid, 
except set group_name previous_revision_uuid: - type: string + type: + - string + - list transient: true immutable: true description: "The uuid of previous revision dataset" @@ -897,33 +901,6 @@ ENTITIES: - validate_sample_category before_property_update_validators: - validate_sample_category - - # No logner required on create, specimen_type -> sample_category 12/15/2022 - specimen_type: - type: string - #required_on_create: true # Only required for create via POST, not update via PUT - description: "A code representing the type of specimen. Must be one of the codes specified in: [tissue sample types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml)" - # Validate the given value against the definitions: https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml - # Disabled validation 12/15/2022 - # before_property_create_validators: - # - validate_specimen_type - # before_property_update_validators: - # - validate_specimen_type - specimen_type_other: - type: string - description: "The user provided sample type if the 'other' sample_type is chosen." - - - # specimen_type no logner required on create, will remove this field when removing specimen_type - # Simply always set to 'Unknown' and no need to update 12/15/2022 - tissue_type: - type: string - generated: true # Can not be updated via the PUT - #auto_update: true # Will always update automatically if the entity gets updated - description: 'The type of the tissue based on the mapping between type (Block/Section/Suspension) and the specimen_type, default is Unknown' - before_create_trigger: set_tissue_type - #before_update_trigger: set_tissue_type - portal_metadata_upload_files: type: json_string description: "A list of relative paths to metadata files" @@ -949,7 +926,6 @@ ENTITIES: immutable: true description: "The displayname of globus group which the user who created this entity is a member of" before_create_trigger: set_group_name - # Should be required on create only when specimen_type==organ organ: type: string description: "Organ code specifier, only set if sample_type == organ. 
Valid values found in: [organ types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/organ_types.yaml)" diff --git a/src/schema/schema_constants.py b/src/schema/schema_constants.py index 5e4ad332..e34bd10a 100644 --- a/src/schema/schema_constants.py +++ b/src/schema/schema_constants.py @@ -2,23 +2,20 @@ class SchemaConstants(object): MEMCACHED_TTL = 7200 - # Constants used by validators INGEST_API_APP = 'ingest-api' INGEST_PIPELINE_APP = 'ingest-pipeline' HUBMAP_APP_HEADER = 'X-Hubmap-Application' DATASET_STATUS_PUBLISHED = 'published' - # Used by triggers, all lowercase for easy comparision ACCESS_LEVEL_PUBLIC = 'public' ACCESS_LEVEL_CONSORTIUM = 'consortium' ACCESS_LEVEL_PROTECTED = 'protected' - # Yaml file to parse organ description - ORGAN_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/organ_types.yaml' - ASSAY_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/assay_types.yaml' - - # For generating Sample.tissue_type - TISSUE_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml' + UUID_API_ID_ENDPOINT = '/uuid' + INGEST_API_FILE_COMMIT_ENDPOINT = '/file-commit' + INGEST_API_FILE_REMOVE_ENDPOINT = '/file-remove' + ONTOLOGY_API_ASSAY_TYPES_ENDPOINT = '/assaytype?application_context=HuBMAP' + ONTOLOGY_API_ORGAN_TYPES_ENDPOINT = '/organs?application_context=HuBMAP' DOI_BASE_URL = 'https://doi.org/' diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 2386b013..cf247934 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -50,9 +50,11 @@ valid_yaml_file : file A valid yaml file uuid_api_url : str - The uuid-api URL + The uuid-api base URL ingest_api_url : str - The ingest-api URL + The ingest-api base URL +ontology_api_url : str + The ontology-api base URL auth_helper_instance : AuthHelper The auth helper instance neo4j_driver_instance : neo4j_driver @@ -65,6 +67,7 @@ def initialize(valid_yaml_file, uuid_api_url, ingest_api_url, + ontology_api_url, auth_helper_instance, neo4j_driver_instance, memcached_client_instance, @@ -73,6 +76,7 @@ def initialize(valid_yaml_file, global _schema global _uuid_api_url global _ingest_api_url + global _ontology_api_url global _auth_helper global _neo4j_driver global _memcached_client @@ -81,6 +85,7 @@ def initialize(valid_yaml_file, _schema = load_provenance_schema(valid_yaml_file) _uuid_api_url = uuid_api_url _ingest_api_url = ingest_api_url + _ontology_api_url = ontology_api_url # Get the helper instances _auth_helper = auth_helper_instance @@ -784,9 +789,13 @@ def validate_json_data_against_schema(json_data_dict, normalized_entity_type, ex invalid_data_type_keys = [] for key in json_data_keys: # boolean starts with bool, string starts with str, integer starts with int, list is list - if (properties[key]['type'] in ['string', 'integer', 'list', 'boolean']) and (not properties[key]['type'].startswith(type(json_data_dict[key]).__name__)): - invalid_data_type_keys.append(key) - + property_type = properties[key]['type'] + if isinstance(property_type, str): + if (property_type in ['string', 'integer', 'list', 'boolean']) and (not property_type.startswith(type(json_data_dict[key]).__name__)): + invalid_data_type_keys.append(key) + elif isinstance(property_type, list): + if not 
any(item.startswith(type(json_data_dict[key]).__name__) for item in property_type): + invalid_data_type_keys.append(key) # Handling json_string as dict if (properties[key]['type'] == 'json_string') and (not isinstance(json_data_dict[key], dict)): invalid_data_type_keys.append(key) @@ -1202,7 +1211,7 @@ def get_user_info(request): def get_hubmap_ids(id): global _uuid_api_url - target_url = _uuid_api_url + '/uuid/' + id + target_url = _uuid_api_url + SchemaConstants.UUID_API_ID_ENDPOINT + '/' + id # Use Memcached to improve performance response = make_request_get(target_url, internal_token_used = True) @@ -1347,7 +1356,6 @@ def create_hubmap_ids(normalized_class, json_data_dict, user_token, user_info_di parent_id = json_data_dict['direct_ancestor_uuid'] json_to_post['parent_ids'] = [parent_id] - # specimen_type -> sample_category 12/15/2022 # 'Sample.sample_category' is marked as `required_on_create` in the schema yaml if json_data_dict['sample_category'].lower() == 'organ': # The 'organ' field containing the 2 digit organ code is required in this case @@ -1365,7 +1373,7 @@ def create_hubmap_ids(normalized_class, json_data_dict, user_token, user_info_di logger.info(json_to_post) # Disable ssl certificate verification - target_url = _uuid_api_url + '/uuid' + target_url = _uuid_api_url + SchemaConstants.UUID_API_ID_ENDPOINT response = requests.post(url = target_url, headers = request_headers, json = json_to_post, verify = False, params = query_parms) # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes @@ -1764,6 +1772,84 @@ def delete_memcached_cache(uuids_list): logger.info(f"Deleted cache by key: {', '.join(cache_keys)}") +""" +Retrive the organ types from ontology-api + +Returns +------- +dict + The available organ types +""" +def get_organ_types(): + global _ontology_api_url + + target_url = _ontology_api_url + '/organs?application_context=HuBMAP' + + # Use Memcached to improve performance + response = make_request_get(target_url, internal_token_used = True) + + # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes + response.raise_for_status() + + if response.status_code == 200: + ids_dict = response.json() + return ids_dict + else: + # uuid-api will also return 400 if the given id is invalid + # We'll just hanle that and all other cases all together here + msg = f"Unable to make a request to query the id via uuid-api: {id}" + # Log the full stack trace, prepend a line with our message + logger.exception(msg) + + logger.debug("======get_organ_types() status code from ontology-api======") + logger.debug(response.status_code) + + logger.debug("======get_organ_types() response text from ontology-api======") + logger.debug(response.text) + + # Also bubble up the error message from ontology-api + raise requests.exceptions.RequestException(response.text) + + +""" +Retrive the assay types from ontology-api + +Returns +------- +dict + The available assay types +""" +def get_assay_types(): + global _ontology_api_url + + target_url = _ontology_api_url + '/assaytype?application_context=HuBMAP' + + # Use Memcached to improve performance + response = make_request_get(target_url, internal_token_used = True) + + # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes + response.raise_for_status() + + if response.status_code == 200: + ids_dict = response.json() + return ids_dict + else: + # uuid-api will also return 400 if the given id is invalid + # We'll just hanle that and all other cases all together here + msg 
= f"Unable to make a request to query the id via uuid-api: {id}" + # Log the full stack trace, prepend a line with our message + logger.exception(msg) + + logger.debug("======get_assay_types() status code from ontology-api======") + logger.debug(response.status_code) + + logger.debug("======get_assay_types() response text from ontology-api======") + logger.debug(response.text) + + # Also bubble up the error message from ontology-api + raise requests.exceptions.RequestException(response.text) + + #################################################################################################### ## Internal functions #################################################################################################### diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py index b23d33ba..231375b3 100644 --- a/src/schema/schema_neo4j_queries.py +++ b/src/schema/schema_neo4j_queries.py @@ -442,24 +442,8 @@ def get_dataset_organ_and_donor_info(neo4j_driver, uuid): donor_metadata = None with neo4j_driver.session() as session: - # Old time-consuming single query, it takes a significant amounts of DB hits - # query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) " - # f"WHERE e.uuid='{uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) " - # f"RETURN s.organ AS organ_name, d.metadata AS donor_metadata") - - # logger.info("======get_dataset_organ_and_donor_info() query======") - # logger.info(query) - - # with neo4j_driver.session() as session: - # record = session.read_transaction(execute_readonly_tx, query) - - # if record: - # organ_name = record['organ_name'] - # donor_metadata = record['donor_metadata'] - # To improve the query performance, we implement the two-step queries to drastically reduce the DB hits sample_query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample) " - # specimen_type -> sample_category 12/15/2022 f"WHERE e.uuid='{uuid}' AND s.sample_category='organ' AND EXISTS(s.organ) " f"RETURN DISTINCT s.organ AS organ_name, s.uuid AS sample_uuid") @@ -473,7 +457,6 @@ def get_dataset_organ_and_donor_info(neo4j_driver, uuid): sample_uuid = sample_record['sample_uuid'] donor_query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(a:Activity)<-[:ACTIVITY_INPUT]-(d:Donor) " - # specimen_type -> sample_category 12/15/2022 f"WHERE s.uuid='{sample_uuid}' AND s.sample_category='organ' AND EXISTS(s.organ) " f"RETURN DISTINCT d.metadata AS donor_metadata") @@ -652,14 +635,13 @@ def link_collection_to_datasets(neo4j_driver, collection_uuid, dataset_uuid_list previous_revision_entity_uuid : str The uuid of previous revision entity """ -def link_entity_to_previous_revision(neo4j_driver, entity_uuid, previous_revision_entity_uuid): +def link_entity_to_previous_revision(neo4j_driver, entity_uuid, previous_revision_entity_uuids): try: with neo4j_driver.session() as session: tx = session.begin_transaction() - - # Create relationship from ancestor entity node to this Activity node - create_relationship_tx(tx, entity_uuid, previous_revision_entity_uuid, 'REVISION_OF', '->') - + for previous_uuid in previous_revision_entity_uuids: + # Create relationship from ancestor entity node to this Activity node + create_relationship_tx(tx, entity_uuid, previous_uuid, 'REVISION_OF', '->') tx.commit() except TransactionError as te: msg = "TransactionError from calling link_entity_to_previous_revision(): " diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 38cf61dc..182e5614 
100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -940,26 +940,36 @@ def get_local_directory_rel_path(property_key, normalized_type, user_token, exis A merged dictionary that contains all possible input data to be used """ def link_to_previous_revision(property_key, normalized_type, user_token, existing_data_dict, new_data_dict): - if 'uuid' not in existing_data_dict: - raise KeyError("Missing 'uuid' key in 'existing_data_dict' during calling 'link_to_previous_revision()' trigger method.") + try: + if 'uuid' not in existing_data_dict: + raise KeyError("Missing 'uuid' key in 'existing_data_dict' during calling 'link_to_previous_revision()' trigger method.") - if 'previous_revision_uuid' not in existing_data_dict: - raise KeyError("Missing 'previous_revision_uuid' key in 'existing_data_dict' during calling 'link_to_previous_revision()' trigger method.") + if 'previous_revision_uuid' not in existing_data_dict: + raise KeyError("Missing 'previous_revision_uuid' key in 'existing_data_dict' during calling 'link_to_previous_revision()' trigger method.") - entity_uuid = existing_data_dict['uuid'] - previous_uuid = existing_data_dict['previous_revision_uuid'] + entity_uuid = existing_data_dict['uuid'] + if isinstance(existing_data_dict['previous_revision_uuid'], list): + previous_uuid = existing_data_dict['previous_revision_uuid'] + else: + previous_uuid = [existing_data_dict['previous_revision_uuid']] - # Create a revision reltionship from this new Dataset node and its previous revision of dataset node in neo4j - try: - schema_neo4j_queries.link_entity_to_previous_revision(schema_manager.get_neo4j_driver_instance(), entity_uuid, previous_uuid) - - # Delete the cache of each associated dataset if any cache exists - # Because the `Dataset.previous_revision_uuid` and `Dataset.next_revision_uuid` fields - uuids_list = [entity_uuid, previous_uuid] - schema_manager.delete_memcached_cache(uuids_list) - except TransactionError: - # No need to log - raise + # Create a revision reltionship from this new Dataset node and its previous revision of dataset node in neo4j + try: + schema_neo4j_queries.link_entity_to_previous_revision(schema_manager.get_neo4j_driver_instance(), entity_uuid, previous_uuid) + + # Delete the cache of each associated dataset if any cache exists + # Because the `Dataset.previous_revision_uuid` and `Dataset.next_revision_uuid` fields + uuids_list = [entity_uuid] + if isinstance(previous_uuid, list): + uuids_list.extend(previous_uuid) + else: + uuids_list.append(previous_uuid) + schema_manager.delete_memcached_cache(uuids_list) + except TransactionError: + # No need to log + raise + except Exception as e: + raise KeyError(e) """ Trigger event method of auto generating the dataset title @@ -1184,7 +1194,7 @@ def commit_thumbnail_file(property_key, normalized_type, user_token, existing_da entity_uuid = existing_data_dict['uuid'] # Commit the thumbnail file via ingest-api call - ingest_api_target_url = schema_manager.get_ingest_api_url() + '/file-commit' + ingest_api_target_url = schema_manager.get_ingest_api_url() + SchemaConstants.INGEST_API_FILE_COMMIT_ENDPOINT # Example: {"temp_file_id":"dzevgd6xjs4d5grmcp4n"} thumbnail_file_dict = new_data_dict[property_key] @@ -1286,7 +1296,7 @@ def delete_thumbnail_file(property_key, normalized_type, user_token, existing_da file_info_dict = generated_dict[target_property_key] # Remove the thumbnail file via ingest-api call - ingest_api_target_url = schema_manager.get_ingest_api_url() + '/file-remove' + 
ingest_api_target_url = schema_manager.get_ingest_api_url() + SchemaConstants.INGEST_API_FILE_REMOVE_ENDPOINT # ingest-api's /file-remove takes a list of files to remove # In this case, we only need to remove the single thumbnail file @@ -1562,105 +1572,6 @@ def get_sample_direct_ancestor(property_key, normalized_type, user_token, existi return property_key, schema_manager.normalize_entity_result_for_response(direct_ancestor_dict) -""" -Trigger event method of generating the type of the tissue based on the mapping between type (Block/Section/Suspension) and the specimen_type -This method applies to both the create and update triggers - -Rererence: - - https://docs.google.com/spreadsheets/d/1OODo8QK852txSNSmfIe0ua4A7nPFSgKq6h46grmrpto/edit#gid=0 - - https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml - -Parameters ----------- -property_key : str - The target property key of the value to be generated -normalized_type : str - One of the types defined in the schema yaml: Sample -user_token: str - The user's globus nexus token -existing_data_dict : dict - A dictionary that contains all existing entity properties -new_data_dict : dict - A merged dictionary that contains all possible input data to be used - -Returns -------- -str: The target property key -str: The type of the tissue -""" -def set_tissue_type(property_key, normalized_type, user_token, existing_data_dict, new_data_dict): - # specimen_type is no logner required on create 12/15/2022, set to Unknown - # Default to use 'Unknown' - tissue_type = 'Unknown' - - # # The `specimen_type` field is required on entity creation via POST - # # thus should be available on existing entity update via PUT - # # We do a double check here just in case - # if ('specimen_type' not in new_data_dict) and ('specimen_type' not in existing_data_dict): - # raise KeyError("Missing 'specimen_type' key in both 'new_data_dict' and 'existing_data_dict' during calling 'set_tissue_type()' trigger method.") - - # # Always calculate the tissue_type value no matter new creation or update existing - # # The `specimen_type` field can be used in a PUT - # # But if it's not in the request JSON of a PUT, it must be in the existing data - # if 'specimen_type' in new_data_dict: - # # The `specimen_type` value validation is handled in the `schema_validators.validate_specimen_type()` - # # and that gets called before this trigger method - # specimen_type = new_data_dict['specimen_type'].lower() - # else: - # # Use lowercase in case someone manually updated the neo4j filed with incorrect case - # specimen_type = existing_data_dict['specimen_type'].lower() - - # # Categories: Block, Section, Suspension - # block_category = [ - # 'pbmc', - # 'biopsy', - # 'segment', - # 'ffpe_block', - # 'organ_piece', - # 'fresh_tissue', - # 'clarity_hydrogel', - # 'fixed_tissue_piece', - # 'fresh_frozen_tissue', - # 'fresh_frozen_oct_block', - # 'formalin_fixed_oct_block', - # 'pfa_fixed_frozen_oct_block', - # 'flash_frozen_liquid_nitrogen', - # 'frozen_cell_pellet_buffy_coat' - # ] - - # section_category = [ - # 'ffpe_slide', - # 'fixed_frozen_section_slide', - # 'fresh_frozen_section_slide', - # 'fresh_frozen_tissue_section', - # 'cryosections_curls_rnalater', - # 'cryosections_curls_from_fresh_frozen_oct' - # ] - - # suspension_category = [ - # 'gdna', - # 'serum', - # 'plasma', - # 'nuclei', - # 'protein', - # 'rna_total', - # 'cell_lysate', - # 'tissue_lysate', - # 'sequence_library', - # 'ran_poly_a_enriched', - # 
'single_cell_cryopreserved' - # ] - - # # Capitalized type, default is 'Unknown' if no match - # if specimen_type in block_category: - # tissue_type = 'Block' - # elif specimen_type in section_category: - # tissue_type = 'Section' - # elif specimen_type in suspension_category: - # tissue_type = 'Suspension' - - return property_key, tissue_type - #################################################################################################### ## Trigger methods specific to Publication - DO NOT RENAME @@ -1994,7 +1905,7 @@ def _commit_files(target_property_key, property_key, normalized_type, user_token entity_uuid = existing_data_dict['uuid'] # Commit the files via ingest-api call - ingest_api_target_url = schema_manager.get_ingest_api_url() + '/file-commit' + ingest_api_target_url = schema_manager.get_ingest_api_url() + SchemaConstants.INGEST_API_FILE_COMMIT_ENDPOINT for file_info in new_data_dict[property_key]: temp_file_id = file_info['temp_file_id'] @@ -2104,7 +2015,7 @@ def _delete_files(target_property_key, property_key, normalized_type, user_token file_uuids.append(file_uuid) # Remove the files via ingest-api call - ingest_api_target_url = schema_manager.get_ingest_api_url() + '/file-remove' + ingest_api_target_url = schema_manager.get_ingest_api_url() + SchemaConstants.INGEST_API_FILE_REMOVE_ENDPOINT json_to_post = { 'entity_uuid': entity_uuid, @@ -2143,39 +2054,10 @@ def _delete_files(target_property_key, property_key, normalized_type, user_token str: The corresponding assay type description """ def _get_assay_type_description(assay_type): - yaml_file_url = SchemaConstants.ASSAY_TYPES_YAML - - # Use Memcached to improve performance - response = schema_manager.make_request_get(yaml_file_url) - - if response.status_code == 200: - yaml_file = response.text - - try: - assay_types_dict = yaml.safe_load(response.text) - - if assay_type in assay_types_dict: - return assay_types_dict[assay_type]['description'].lower() - else: - # Check the 'alt-names' list if not found in the top-level keys - for key in assay_types_dict: - if assay_type in assay_types_dict[key]['alt-names']: - return assay_types_dict[key]['description'].lower() - except yaml.YAMLError as e: - raise yaml.YAMLError(e) - else: - msg = f"Unable to fetch the: {yaml_file_url}" - # Log the full stack trace, prepend a line with our message - logger.exception(msg) - - logger.debug("======_get_assay_type_description() status code======") - logger.debug(response.status_code) - - logger.debug("======_get_assay_type_description() response text======") - logger.debug(response.text) + assay_types_dict = schema_manager.get_assay_types() - # Also bubble up the error message - raise requests.exceptions.RequestException(response.text) + if assay_type in assay_types_dict: + return assay_types_dict[assay_type]['description'].lower() """ @@ -2230,32 +2112,7 @@ def _get_combined_assay_type_description(data_types): str: The organ code description """ def _get_organ_description(organ_code): - yaml_file_url = SchemaConstants.ORGAN_TYPES_YAML - - # Use Memcached to improve performance - response = schema_manager.make_request_get(yaml_file_url) - - if response.status_code == 200: - yaml_file = response.text - - try: - organ_types_dict = yaml.safe_load(response.text) - return organ_types_dict[organ_code]['description'].lower() - except yaml.YAMLError as e: - raise yaml.YAMLError(e) - else: - msg = f"Unable to fetch the: {yaml_file_url}" - # Log the full stack trace, prepend a line with our message - logger.exception(msg) - - 
logger.debug("======_get_organ_description() status code======") - logger.debug(response.status_code) - - logger.debug("======_get_organ_description() response text======") - logger.debug(response.text) - - # Also bubble up the error message - raise requests.exceptions.RequestException(response.text) - + organ_types_dict = schema_manager.get_organ_types() + return organ_types_dict[organ_code]['description'].lower() diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py index 8c1269a5..cedd2d53 100644 --- a/src/schema/schema_validators.py +++ b/src/schema/schema_validators.py @@ -430,33 +430,6 @@ def validate_upload_status_value(property_key, normalized_entity_type, request, raise ValueError(f"Invalid status value: {new_status}") -""" -NOTE: TO BE REMOVED when we remove specimen_type field - -Validate the provided value of Sample.specimen_type on create via POST and update via PUT - -Parameters ----------- -property_key : str - The target property key -normalized_type : str - Submission -request: Flask request object - The instance of Flask request passed in from application request -existing_data_dict : dict - A dictionary that contains all existing entity properties -new_data_dict : dict - The json data in request body, already after the regular validations -""" -def validate_specimen_type(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict): - # Use lowercase for comparison - defined_tissue_types = _get_tissue_types() - specimen_type = new_data_dict[property_key].lower() - - if specimen_type not in defined_tissue_types: - raise ValueError(f"Invalid specimen_type value: {specimen_type}") - - """ Validate the provided value of Sample.sample_category on create via POST and update via PUT @@ -562,47 +535,3 @@ def _validate_application_header(applications_allowed, request_headers): msg = f"Unable to proceed due to invalid {SchemaConstants.HUBMAP_APP_HEADER} header value: {app_header}" raise schema_errors.InvalidApplicationHeaderException(msg) - -""" -Get the complete list of defined tissue types - -Returns -------- -list: The list of defined tissue types -""" -def _get_tissue_types(): - yaml_file_url = SchemaConstants.TISSUE_TYPES_YAML - - # Use Memcached to improve performance - response = schema_manager.make_request_get(yaml_file_url) - - if response.status_code == 200: - yaml_file = response.text - - try: - tissue_types_dict = yaml.safe_load(response.text) - - # We don't need the description here, just a list of tissue types - # Note: dict.keys() returns a dict, need to typecast to list - tissue_types_list = list(tissue_types_dict.keys()) - - # Add the 'other' - tissue_types_list.append('other') - - return tissue_types_list - except yaml.YAMLError as e: - raise yaml.YAMLError(e) - else: - msg = f"Unable to fetch the: {yaml_file_url}" - # Log the full stack trace, prepend a line with our message - logger.exception(msg) - - logger.debug("======_get_tissue_types() status code======") - logger.debug(response.status_code) - - logger.debug("======_get_tissue_types() response text======") - logger.debug(response.text) - - # Also bubble up the error message - raise requests.exceptions.RequestException(response.text) - diff --git a/src/schema_templating/example-yaml-templates/api-template-test/entity-Template.yaml b/src/schema_templating/example-yaml-templates/api-template-test/entity-Template.yaml index 5a6739f7..ab8ec463 100644 --- a/src/schema_templating/example-yaml-templates/api-template-test/entity-Template.yaml +++ 
b/src/schema_templating/example-yaml-templates/api-template-test/entity-Template.yaml @@ -398,60 +398,6 @@ x-ref-components: - consortium - public description: "One of the values: public, consortium." - specimen_type: - type: string - enum: - - atacseq - - biopsy - - blood - - cell_lysate - - clarity_hydrogel - - codex - - cryosections_curls_from_fresh_frozen_oct - - cryosections_curls_rnalater - - ffpe_block - - ffpe_slide - - fixed_frozen_section_slide - - fixed_tissue_piece - - flash_frozen_liquid_nitrogen - - formalin_fixed_oct_block - - fresh_frozen_oct_block - - fresh_frozen_section_slide - - fresh_frozen_tissue - - fresh_frozen_tissue_section - - fresh_tissue - - frozen_cell_pellet_buffy_coat - - gdna - - module - - nuclei - - nuclei_rnalater - - organ - - organ_piece - - other - - pbmc - - pfa_fixed_frozen_oct_block - - plasma - - protein - - ran_poly_a_enriched - - rna_total - - rnalater_treated_and_stored - - rnaseq - - scatacseq - - scrnaseq - - segment - - seqfish - - sequence_library - - serum - - sequence_library - - single_cell_cryopreserved - - snatacseq - - snrnaseq - - tissue_lysate - - wgs - description: "A code representing the type of specimen. Must be one of the codes specified in: [tissue sample types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml)" - specimen_type_other: - type: string - description: "The user provided sample type if the 'other' sample_type is chosen." protocol_url: type: string description: "The protocols.io doi url pointing the protocol under wich the sample was obtained and/or prepared." diff --git a/src/schema_templating/example-yaml-templates/sample-schema.yaml b/src/schema_templating/example-yaml-templates/sample-schema.yaml index 1f5ee751..2b6f3f11 100644 --- a/src/schema_templating/example-yaml-templates/sample-schema.yaml +++ b/src/schema_templating/example-yaml-templates/sample-schema.yaml @@ -72,15 +72,6 @@ Sample: - consortium - public description: "One of the values: public, consortium." - specimen_type: - type: string - enum: - X-replace-enum-list: - enum-file-ref: https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml - description: "A code representing the type of specimen. Must be one of the codes specified in: [tissue sample types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml)" - specimen_type_other: - type: string - description: "The user provided sample type if the 'other' sample_type is chosen." protocol_url: type: string description: "The protocols.io doi url pointing the protocol under wich the sample was obtained and/or prepared."
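
Usage note (not part of the patch): a minimal Python sketch of how a client might exercise the two behaviors introduced above, the new GET /datasets/{id}/multi-revisions endpoint and the list form of previous_revision_uuid on dataset creation. The base URL, token, ids, and the create-entity payload fields are placeholders/assumptions, not values taken from this diff.

import requests

# Illustrative values only -- replace with a real entity-api base URL, token, and ids.
BASE_URL = "https://entity.api.hubmapconsortium.org"
HEADERS = {"Authorization": "Bearer <globus-groups-token>"}

# New in this patch: GET /datasets/{id}/multi-revisions returns every revision in the
# chain, newest first, as a list of {"revision_number": N, "uuids": [...]} dictionaries.
# Passing include_dataset=true returns the full dataset bodies instead of bare uuids.
resp = requests.get(
    f"{BASE_URL}/datasets/HBM123.ABCD.456/multi-revisions",
    params={"include_dataset": "true"},
    headers=HEADERS,
)
resp.raise_for_status()
for revision in resp.json():
    print(revision["revision_number"], revision["uuids"])

# Also new: previous_revision_uuid may be a list when creating a Dataset, making the new
# dataset a revision of several published datasets at once. The API rejects the request
# if any dataset in the list is itself a revision of another dataset in the list, or if
# any of them is unpublished or already has a newer revision.
payload = {
    "previous_revision_uuid": [
        "<published-dataset-uuid-1>",
        "<published-dataset-uuid-2>",
    ],
    # ... remaining required Dataset creation fields go here (omitted) ...
}
resp = requests.post(f"{BASE_URL}/entities/dataset", json=payload, headers=HEADERS)
print(resp.status_code)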