Skip to content

Commit

Permalink
Remove meta KG creation code from KG2c build #2411
Browse files Browse the repository at this point in the history
  • Loading branch information
amykglen committed Nov 19, 2024
1 parent abb7473 commit e7540de
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 92 deletions.
6 changes: 3 additions & 3 deletions code/kg2c/build_kg2c.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from create_kg2c_files import create_kg2c_files
from record_kg2c_meta_info import record_meta_kg_info
from record_kg2c_meta_info import record_select_meta_info
import file_manager

KG2C_DIR = f"{os.path.dirname(os.path.abspath(__file__))}"
Expand Down Expand Up @@ -118,8 +118,8 @@ def main():
# Actually build KG2c
logging.info("Calling create_kg2c_files.py..")
create_kg2c_files(args.kg2pre_version, args.sub_version, args.biolink_version, synonymizer_name, args.test)
logging.info("Calling record_kg2c_meta_info.py..")
record_meta_kg_info(args.biolink_version, args.test)
logging.info("Calling record_select_meta_info.py..")
record_select_meta_info(args.biolink_version, args.test)

# Upload artifacts to the relevant places
file_manager.make_kg2c_tarball(args.test)
Expand Down
3 changes: 0 additions & 3 deletions code/kg2c/file_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,9 +257,6 @@ def upload_kg2c_files_to_arax_databases_server(kg2pre_version: str, sub_version:
upload_file_to_arax_databases_server(local_file_path=f"{KG2C_DIR}/kg2c.sqlite{test_suffix}",
remote_file_name=f"kg2c_{sub_version}_KG{kg2pre_version}.sqlite{test_suffix}",
kg2pre_version=kg2pre_version)
upload_file_to_arax_databases_server(local_file_path=f"{KG2C_DIR}/meta_kg.json{test_suffix}",
remote_file_name=f"meta_kg_{sub_version}_KG{kg2pre_version}c.json{test_suffix}",
kg2pre_version=kg2pre_version)
upload_file_to_arax_databases_server(local_file_path=f"{KG2C_DIR}/fda_approved_drugs.pickle{test_suffix}",
remote_file_name=f"fda_approved_drugs_{sub_version}_KG{kg2pre_version}c.pickle{test_suffix}",
kg2pre_version=kg2pre_version)
Expand Down
89 changes: 3 additions & 86 deletions code/kg2c/record_kg2c_meta_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,87 +22,6 @@
KG2C_DIR = f"{os.path.dirname(os.path.abspath(__file__))}"


def serialize_with_sets(obj: object) -> object:
    """Convert set-like objects into lists so the ``json`` module can encode them.

    Meant to be supplied as the ``default=`` hook of ``json.dump``/``json.dumps``.

    Args:
        obj: The object the JSON encoder handed over.

    Returns:
        A new ``list`` holding the members of ``obj`` when it is a ``set`` or
        ``frozenset`` (element order is whatever iteration yields); otherwise
        ``obj`` itself, unchanged.
    """
    # Thank you https://stackoverflow.com/a/60544597
    if isinstance(obj, (set, frozenset)):
        return list(obj)
    return obj

def get_meta_qualifier(qualified_predicate, qualified_object_direction, qualified_object_aspect):
    """Assemble the list of qualifier entries for one meta edge.

    Each argument is a sized collection of applicable values for one
    qualifier type. An entry is produced only for the collections that are
    non-empty, always in the order predicate, object direction, object aspect.

    Returns:
        A list of dicts, each with the keys ``"qualifier_type_id"`` and
        ``"applicable_values"``; empty when all three inputs are empty.
    """
    values_by_type_id = (
        ("biolink:qualified_predicate", qualified_predicate),
        ("biolink:object_direction_qualifier", qualified_object_direction),
        ("biolink:object_aspect_qualifier", qualified_object_aspect),
    )
    return [{"qualifier_type_id": type_id, "applicable_values": applicable_values}
            for type_id, applicable_values in values_by_type_id
            if len(applicable_values) > 0]
def add_edge_to_applicable_values(qualifier_dict, key, value):
    """Register ``value`` as an applicable qualifier value under ``key``.

    ``qualifier_dict`` is mutated in place. ``key`` is always given an entry
    (an empty list if it had none). ``value`` is appended to that list unless
    it is the empty string or is already present.
    """
    applicable_values = qualifier_dict.setdefault(key, [])
    if value != "" and value not in applicable_values:
        applicable_values.append(value)


def build_meta_kg(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, Dict[str, any]],
                  meta_kg_file_name: str, biolink_helper: BiolinkHelper, is_test: bool):
    """Build the meta knowledge graph and save it as JSON in ``KG2C_DIR``.

    The meta KG has two parts:
      * ``"edges"``: one entry per distinct (subject category, predicate,
        object category) triple seen across all edges, optionally carrying a
        ``"qualifiers"`` list.
      * ``"nodes"``: for every node category, the set of CURIE prefixes
        (``"id_prefixes"``) found among the equivalent CURIEs of its nodes.

    Args:
        nodes_by_id: Nodes keyed by node ID; each node is read for its
            ``"all_categories"`` and ``"category"`` values.
        edges_by_id: Edges keyed by edge ID; each edge is read for its
            ``"subject"``, ``"object"``, ``"predicate"``, ``"qualified_predicate"``,
            ``"qualified_object_direction"`` and ``"qualified_object_aspect"`` values.
        meta_kg_file_name: Name of the output file, created inside ``KG2C_DIR``.
        biolink_helper: Supplies ``add_conflations`` for expanding category lists.
        is_test: When true, edges whose subject or object is missing from
            ``nodes_by_id`` are skipped; otherwise a missing node raises ``KeyError``.
    """
    logging.info("Building meta KG..")
    logging.info(" Gathering all meta triples..")
    meta_triples = set()
    # Applicable qualifier values, keyed by "<subject category>-<object category>"
    qualified_predicate = {}
    qualified_object_direction = {}
    qualified_object_aspect = {}
    for edge in edges_by_id.values():
        subject_node_id = edge["subject"]
        object_node_id = edge["object"]
        # Only test builds are allowed to contain edges with missing endpoints
        if is_test and not (subject_node_id in nodes_by_id and object_node_id in nodes_by_id):
            continue
        subject_node = nodes_by_id[subject_node_id]
        object_node = nodes_by_id[object_node_id]
        subject_categories = biolink_helper.add_conflations(subject_node["all_categories"])
        object_categories = biolink_helper.add_conflations(object_node["all_categories"])
        predicate = edge["predicate"]

        for subject_category in subject_categories:
            for object_category in object_categories:
                category_pair = f"{subject_category}-{object_category}"
                # Every pair gets an entry in all three dicts (possibly an empty list),
                # so the lookups done while building the meta edges cannot fail
                add_edge_to_applicable_values(qualified_predicate, category_pair,
                                              edge["qualified_predicate"])
                add_edge_to_applicable_values(qualified_object_direction, category_pair,
                                              edge["qualified_object_direction"])
                add_edge_to_applicable_values(qualified_object_aspect, category_pair,
                                              edge["qualified_object_aspect"])
                meta_triples.add((subject_category, predicate, object_category))

    meta_edges = []
    for subject_category, predicate, object_category in meta_triples:
        category_pair = f"{subject_category}-{object_category}"
        meta_edge = {"subject": subject_category,
                     "predicate": predicate,
                     "object": object_category}
        # NOTE(review): qualifiers are tracked per category pair (not per predicate) and
        # are attached only when the pair has at least one qualified predicate, even if
        # direction/aspect values exist - confirm this is intended
        if qualified_predicate[category_pair] != []:
            meta_edge["qualifiers"] = get_meta_qualifier(qualified_predicate[category_pair],
                                                         qualified_object_direction[category_pair],
                                                         qualified_object_aspect[category_pair])
        meta_edges.append(meta_edge)
    logging.info(f" Created {len(meta_edges)} meta edges")

    logging.info(" Gathering all meta nodes..")
    with open(f"{KG2C_DIR}/equivalent_curies.pickle", "rb") as equiv_curies_file:
        equivalent_curies_dict = pickle.load(equiv_curies_file)
    meta_nodes = defaultdict(lambda: defaultdict(set))
    for node_id, node in nodes_by_id.items():
        # Nodes without an entry in the pickle fall back to their own ID
        equivalent_curies = equivalent_curies_dict.get(node_id, [node_id])
        prefixes = {curie.split(":")[0] for curie in equivalent_curies}
        categories = biolink_helper.add_conflations(node["category"])
        for category in categories:
            meta_nodes[category]["id_prefixes"].update(prefixes)
    logging.info(f" Created {len(meta_nodes)} meta nodes")

    logging.info(" Saving meta KG to JSON file..")
    meta_kg = {"nodes": meta_nodes, "edges": meta_edges}
    with open(f"{KG2C_DIR}/{meta_kg_file_name}", "w+") as meta_kg_file:
        # The prefix sets are not JSON-serializable on their own, hence the default hook
        json.dump(meta_kg, meta_kg_file, default=serialize_with_sets, indent=2)


def add_neighbor_counts_to_sqlite(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, Dict[str, any]],
sqlite_file_name: str, label_property_name: str):
logging.info("Counting up node neighbors by category..")
Expand Down Expand Up @@ -191,7 +110,7 @@ def generate_fda_approved_drugs_pickle(edges_by_id: Dict[str, Dict[str, any]], f
pickle.dump(fda_approved_drugs, pickle_file)


def record_meta_kg_info(biolink_version: str, is_test: bool):
def record_select_meta_info(biolink_version: str, is_test: bool):
logging.info("Starting to record KG2c meta info..")
bh = BiolinkHelper(biolink_version)
start = time.time()
Expand All @@ -209,15 +128,13 @@ def record_meta_kg_info(biolink_version: str, is_test: bool):
for node in nodes_by_id.values():
node[expanded_labels_property_name] = bh.get_ancestors(node["all_categories"], include_mixins=True)

meta_kg_file_name = f"meta_kg.json{'_TEST' if is_test else ''}"
sqlite_file_name = f"kg2c.sqlite{'_TEST' if is_test else ''}"
fda_approved_file_name = f"fda_approved_drugs.pickle{'_TEST' if is_test else ''}"
build_meta_kg(nodes_by_id, edges_by_id, meta_kg_file_name, bh, is_test)
add_neighbor_counts_to_sqlite(nodes_by_id, edges_by_id, sqlite_file_name, expanded_labels_property_name)
add_category_counts_to_sqlite(nodes_by_id, sqlite_file_name, expanded_labels_property_name)
generate_fda_approved_drugs_pickle(edges_by_id, fda_approved_file_name)

logging.info(f"Recording meta KG info took {round((time.time() - start) / 60, 1)} minutes.")
logging.info(f"Recording meta info took {round((time.time() - start) / 60, 1)} minutes.")


def main():
Expand All @@ -230,7 +147,7 @@ def main():
help="The Biolink version that the given KG2pre version uses (e.g., 4.0.1).")
arg_parser.add_argument("--test", dest="test", action='store_true', default=False)
args = arg_parser.parse_args()
record_meta_kg_info(args.biolink_version, args.test)
record_select_meta_info(args.biolink_version, args.test)


if __name__ == "__main__":
Expand Down

0 comments on commit e7540de

Please sign in to comment.