Skip to content

Commit

Permalink
Merge pull request #682 from biolink/issue-380-fix-gpi
Browse files: browse the repository at this point in the history
fixes pipeline issue 380 by editing the header, taxon, and type of the GPI…
  • Loading branch information
sierra-moxon authored Jul 24, 2024
2 parents 5f865b1 + 12ed886 commit df8f688
Show file tree
Hide file tree
Showing 7 changed files with 5,791 additions and 4,804 deletions.
5 changes: 3 additions & 2 deletions ontobio/io/entityparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,10 @@ def gpi_version(self) -> str:
else:
return self.default_version

def parse_line(self, line):
def parse_line(self, line) -> (str, List[Dict]):
"""Parses a single line of a GPI.
Return a tuple `(processed_line, entities)`. Typically
Return a tuple `(processed_line, entities)`. Typically,
there will be a single entity, but in some cases there
may be none (invalid line) or multiple (disjunctive clause in
annotation extensions)
Expand Down Expand Up @@ -301,6 +301,7 @@ def line_as_entity_subject(self, line: str):
for entity in entity_dicts:
entity_types = []
if self.gpi_version() == "2.0":

entity_types = [association.Curie.from_str(t) for t in entity["type"]]
if any(c.is_error() for c in entity_types):
logger.error("Skipping `{}` due to malformed CURIE in entity type: `{}`".format(line, entity["type"]))
Expand Down
34 changes: 21 additions & 13 deletions ontobio/io/entitywriter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
"""
Classes for exporting entities.
So far only one implementation
"""
"""Classes for exporting entities."""
import re
from datetime import datetime

from ontobio.model.association import map_gp_type_label_to_curie

# Pattern for the `taxon:<digits>` CURIE form; the numeric id is capture group 1.
external_taxon = re.compile("taxon:([0-9]+)")
# Pattern for the `NCBITaxon:<digits>` CURIE form; the numeric id is capture group 1.
internal_taxon = re.compile("NCBITaxon:([0-9]+)")


def stringify(s):
if s is None:
Expand All @@ -13,8 +16,6 @@ def stringify(s):
else:
return s

external_taxon = re.compile("taxon:([0-9]+)")
internal_taxon = re.compile("NCBITaxon:([0-9]+)")

def normalize_taxon(taxon):
global internal_taxon
Expand Down Expand Up @@ -100,6 +101,8 @@ def __init__(self, file=None, version=None):
if self.file:
if self.version == "2.0":
self.file.write("!gpi-version: 2.0\n")
self.file.write("!date_generated: " + datetime.now().strftime("%Y-%m-%dT%H:%M") + "\n")
self.file.write("!generated_by: GO Central\n")
else:
self.file.write("!gpi-version: 1.2\n")

Expand Down Expand Up @@ -140,14 +143,19 @@ def write_entity(self, entity):
"""

taxon = entity.get("taxon").get("id")
if normalize_taxon(taxon).startswith("taxon:"):
taxon = taxon.replace("taxon:", "NCBITaxon:")

if self.version == "2.0":
vals = [
entity.get('id'), # DB_Object_ID
entity.get('label'), # DB_Object_symbol
entity.get('full_name'), # DB_Object_Name
entity.get('synonyms'), # DB_Object_Synonyms
entity.get('type'), # DB_Object_Type
normalize_taxon(entity.get("taxon").get("id")), # DB_Object_Taxon
# GPI spec says this is single valued, GpiParser returns list, so take the first element here.
str(map_gp_type_label_to_curie(entity.get('type')[0])), # DB_Object_Type to curie vs. label
taxon, # DB_Object_Taxon, normalized to NCBITaxon prefix
"", # Encoded_by
entity.get('parents'), # Parent_Protein
"", # Protein_Containing_Complex_Members
Expand All @@ -160,10 +168,10 @@ def write_entity(self, entity):
prefix, # DB
local_id, # DB_Object_ID
entity.get('label'), # DB_Object_Symbol
entity.get('full_name'), # DB_Object_Symbol
entity.get('synonyms'), # DB_Object_Name
entity.get('type'), # DB_Object_Synonyms
normalize_taxon(entity.get("taxon").get("id")), # taxon
entity.get('full_name'), # DB_Object_Full_Name
entity.get('synonyms'), # DB_Object_Synonyms
entity.get('type'), # DB_Object_Type
normalize_taxon(entity.get("taxon").get("id")), # taxon in gpi 1.2 was prefixed by `taxon:`
entity.get('parents'), # Parent_Object_ID
entity.get('xrefs'), # DB_Xref(s)
entity.get('properties') # Properties
Expand Down
Loading

0 comments on commit df8f688

Please sign in to comment.