Merge pull request #682 from biolink/issue-380-fix-gpi

Fixes pipeline issue 380 by editing the header, taxon, and type of the GPI…
biolink · Jul 24, 2024 · df8f688 · df8f688
2 parents 5f865b1 + 12ed886
commit df8f688
Show file tree

Hide file tree

Showing 7 changed files with 5,791 additions and 4,804 deletions.
diff --git a/ontobio/io/entityparser.py b/ontobio/io/entityparser.py
@@ -122,10 +122,10 @@ def gpi_version(self) -> str:
         else:
             return self.default_version
 
-    def parse_line(self, line):
+    def parse_line(self, line) -> (str, List[Dict]):
         """Parses a single line of a GPI.
 
-        Return a tuple `(processed_line, entities)`. Typically
+        Return a tuple `(processed_line, entities)`. Typically,
         there will be a single entity, but in some cases there
         may be none (invalid line) or multiple (disjunctive clause in
         annotation extensions)
@@ -301,6 +301,7 @@ def line_as_entity_subject(self, line: str):
         for entity in entity_dicts:
             entity_types = []
             if self.gpi_version() == "2.0":
+
                 entity_types = [association.Curie.from_str(t) for t in entity["type"]]
                 if any(c.is_error() for c in entity_types):
                     logger.error("Skipping `{}` due to malformed CURIE in entity type: `{}`".format(line, entity["type"]))

diff --git a/ontobio/io/entitywriter.py b/ontobio/io/entitywriter.py
@@ -1,9 +1,12 @@
-"""
-Classes for exporting entities.
-
-So far only one implementation
-"""
+"""Classes for exporting entities."""
 import re
+from datetime import datetime
+
+from ontobio.model.association import map_gp_type_label_to_curie
+
+external_taxon = re.compile("taxon:([0-9]+)")
+internal_taxon = re.compile("NCBITaxon:([0-9]+)")
+
 
 def stringify(s):
     if s is None:
@@ -13,8 +16,6 @@ def stringify(s):
     else:
         return s
 
-external_taxon = re.compile("taxon:([0-9]+)")
-internal_taxon = re.compile("NCBITaxon:([0-9]+)")
 
 def normalize_taxon(taxon):
     global internal_taxon
@@ -100,6 +101,8 @@ def __init__(self, file=None, version=None):
         if self.file:
             if self.version == "2.0":
                 self.file.write("!gpi-version: 2.0\n")
+                self.file.write("!date_generated: " + datetime.now().strftime("%Y-%m-%dT%H:%M") + "\n")
+                self.file.write("!generated_by: GO Central\n")
             else:
                 self.file.write("!gpi-version: 1.2\n")
 
@@ -140,14 +143,19 @@ def write_entity(self, entity):
 
         """
 
+        taxon = entity.get("taxon").get("id")
+        if normalize_taxon(taxon).startswith("taxon:"):
+            taxon = taxon.replace("taxon:", "NCBITaxon:")
+
         if self.version == "2.0":
             vals = [
                 entity.get('id'),  # DB_Object_ID
                 entity.get('label'),  # DB_Object_symbol
                 entity.get('full_name'),  # DB_Object_Name
                 entity.get('synonyms'),  # DB_Object_Synonyms
-                entity.get('type'),  # DB_Object_Type
-                normalize_taxon(entity.get("taxon").get("id")),  # DB_Object_Taxon
+                # GPI spec says this is single valued, GpiParser returns list, so take the first element here.
+                str(map_gp_type_label_to_curie(entity.get('type')[0])),  # DB_Object_Type to curie vs. label
+                taxon,  # DB_Object_Taxon, normalized to NCBITaxon prefix
                 "",  # Encoded_by
                 entity.get('parents'),  # Parent_Protein
                 "",  # Protein_Containing_Complex_Members
@@ -160,10 +168,10 @@ def write_entity(self, entity):
                 prefix,  # DB
                 local_id,  # DB_Object_ID
                 entity.get('label'),  # DB_Object_Symbol
-                entity.get('full_name'),  # DB_Object_Symbol
-                entity.get('synonyms'),  # DB_Object_Name
-                entity.get('type'),  # DB_Object_Synonyms
-                normalize_taxon(entity.get("taxon").get("id")),  # taxon
+                entity.get('full_name'),  # DB_Object_Full_Name
+                entity.get('synonyms'),  # DB_Object_Synonyms
+                entity.get('type'),  # DB_Object_Type
+                normalize_taxon(entity.get("taxon").get("id")),  # taxon in gpi 1.2 was prefixed by `taxon:`
                 entity.get('parents'),  # Parent_Object_ID
                 entity.get('xrefs'),  # DB_Xref(s)
                 entity.get('properties')  # Properties