2025-04-16 22:12:19 +02:00

308 lines
9.1 KiB
Python

#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'mapping' module. Handles the mapping of user-defined schema to the
underlying ontology.
"""
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from typing import Optional
from urllib.request import urlopen
import yaml
from . import _misc
from ._config import config as _config
class OntologyMapping:
    """
    Class to store the ontology mapping and extensions.

    Attributes:
        schema:
            The schema configuration as read from the YAML file (an empty
            dict if no file was given). Entries are modified in place while
            the extended schema is built (default `preferred_id`, inherited
            properties).
        extended_schema:
            The schema including inherited entries and the virtual leaves
            created for entries with multiple preferred ids or sources.
    """

    def __init__(self, config_file: Optional[str] = None):
        """
        Read the schema configuration and build the extended schema.

        Args:
            config_file:
                Path or http(s) URL of the schema configuration YAML file.
                If None, an empty schema is used.
        """
        self.schema = self._read_config(config_file)

        self.extended_schema = self._extend_schema()

    def _read_config(self, config_file: Optional[str] = None) -> dict:
        """
        Read the configuration file and return the ontology mapping and
        extensions it contains.

        Args:
            config_file:
                Path or http(s) URL of the YAML file. If None, no file is
                read.

        Returns:
            The parsed YAML content; an empty dict if `config_file` is None
            or the document is empty.
        """
        if config_file is None:
            return {}

        # load yaml file from web; match the full scheme so that a local file
        # whose name merely starts with "http" is not mistaken for a URL
        if config_file.startswith(("http://", "https://")):
            with urlopen(config_file) as f:
                schema_config = yaml.safe_load(f)

        # otherwise assume the file is local; decode as UTF-8 instead of
        # relying on the platform's locale encoding
        else:
            with open(config_file, "r", encoding="utf-8") as f:
                schema_config = yaml.safe_load(f)

        # an empty YAML document parses to None, which would break the
        # `.items()` iteration in `_extend_schema`
        return {} if schema_config is None else schema_config

    def _extend_schema(self, d: Optional[dict] = None) -> dict:
        """
        Get leaves of the tree hierarchy from the data structure dict
        contained in the `schema_config.yaml`. Creates virtual leaves
        (as children) from entries that provide more than one preferred
        id type (and corresponding inputs) or more than one source.

        Args:
            d:
                Data structure dict from yaml file. Falls back to
                `self.schema` if not given (or empty). Its entries are
                modified in place.

        Returns:
            Dict of all schema entries plus the generated virtual leaves.
        """
        d = d or self.schema

        extended_schema = {}

        # first pass: get parent leaves with direct representation in ontology
        for k, v in d.items():
            # k is not an entity
            if "represented_as" not in v:
                continue

            # preferred_id optional: if not provided, use `id`
            if not v.get("preferred_id"):
                v["preferred_id"] = "id"

            # k is an entity that is present in the ontology
            if "is_a" not in v:
                extended_schema[k] = v

        # second pass: "vertical" inheritance
        d = self._vertical_property_inheritance(d)
        for k, v in d.items():
            if "is_a" not in v:
                continue

            # prevent loops
            if k == v["is_a"]:
                logger.warning(
                    f"Loop detected in ontology mapping: {k} -> {v}. "
                    "Removing item. Please fix the inheritance if you want "
                    "to use this item."
                )
                continue

            extended_schema[k] = v

        # "horizontal" inheritance: create siblings for multiple identifiers or
        # sources -> virtual leaves or implicit children
        for k, v in d.items():
            # k is not an entity
            if "represented_as" not in v:
                continue

            # multiple preferred ids take precedence over multiple sources
            if isinstance(v.get("preferred_id"), list):
                extended_schema.update(self._horizontal_inheritance_pid(k, v))
            elif isinstance(v.get("source"), list):
                extended_schema.update(
                    self._horizontal_inheritance_source(k, v)
                )

        return extended_schema

    def _vertical_property_inheritance(self, d):
        """
        Inherit properties from parents to children and update `d` accordingly.

        Only entities (entries with `represented_as`) that declare `is_a` and
        a truthy `inherit_properties` are touched. The direct ancestor is
        `is_a` itself, or its first element if `is_a` is a list; it is looked
        up in `self.schema`. If the ancestor is not part of the schema
        configuration, there is nothing to inherit and the child is left with
        its own (possibly empty) properties.

        Args:
            d:
                Data structure dict from yaml file; its entries are modified
                in place.

        Returns:
            The same dict `d`.
        """
        for v in d.values():
            # not an entity
            if "represented_as" not in v:
                continue

            # entity that is present in the ontology: no parent in the schema
            if "is_a" not in v:
                continue

            # "vertical" inheritance is opt-in
            if not v.get("inherit_properties", False):
                continue

            # get direct ancestor
            is_a = v["is_a"]
            parent = is_a[0] if isinstance(is_a, list) else is_a

            # ensure child has properties and exclude_properties; a
            # `properties:` key without value is parsed as None
            if v.get("properties") is None:
                v["properties"] = {}

            if "exclude_properties" not in v:
                v["exclude_properties"] = {}

            # the parent may not be configured in the schema at all
            parent_entry = self.schema.get(parent)
            if parent_entry is None:
                continue

            # update properties of child; on name clashes the parent's
            # definition replaces the child's
            parent_props = parent_entry.get("properties", {})
            if parent_props:
                v["properties"].update(parent_props)

            # NOTE(review): merged via dict.update, so this expects mappings;
            # list-valued `exclude_properties` would fail here -- TODO confirm
            # the expected type
            parent_excl_props = parent_entry.get("exclude_properties", {})
            if parent_excl_props:
                v["exclude_properties"].update(parent_excl_props)

        return d

    @staticmethod
    def _broadcast(value, length):
        """
        Repeat a single string `length` times; return anything else unchanged.
        """
        if isinstance(value, str):
            return [value] * length

        return value

    @staticmethod
    def _virtual_leaves(key, value, attribute, items, labels, reps):
        """
        Build one virtual leaf per element of `items`, named `<item>.<key>`.

        Args:
            key: Name of the schema entry the leaves are created for.
            value: The schema entry itself.
            attribute: Name of the attribute that is split up
                (`preferred_id` or `source`).
            items: The individual values of `attribute`.
            labels: Input labels, aligned with `items`.
            reps: `represented_as` values, aligned with `items`.

        Returns:
            Dict mapping leaf names to leaf definitions. Iteration stops at
            the shortest of `items`, `labels` and `reps`.
        """
        # attributes that are set per leaf and therefore not copied over
        core_attributes = (
            "is_a",
            attribute,
            "input_label",
            "label_in_input",
            "represented_as",
        )
        inherited = {
            k: v for k, v in value.items() if k not in core_attributes
        }

        leaves = {}

        for item, lab, rep in zip(items, labels, reps):
            leaf = {
                attribute: item,
                "input_label": lab,
                "represented_as": rep,
                # mark as virtual
                "virtual": True,
            }

            # inherit is_a if exists
            if "is_a" in value:
                # treat as multiple inheritance, with the entry itself as
                # the direct parent; every leaf gets its own list
                if isinstance(value["is_a"], list):
                    leaf["is_a"] = [key, *value["is_a"]]
                else:
                    leaf["is_a"] = [key, value["is_a"]]
            else:
                # set parent as is_a
                leaf["is_a"] = key

            # inherit everything except core attributes (values are shared
            # with the parent entry, not copied)
            leaf.update(inherited)

            leaves[item + "." + key] = leaf

        return leaves

    def _horizontal_inheritance_pid(self, key, value):
        """
        Create virtual leaves for multiple preferred id types.

        `input_label`/`label_in_input` and `represented_as` may be given as
        lists (one element per preferred id) or as a single string, which is
        then used for every leaf.

        Args:
            key: Name of the schema entry.
            value: The schema entry; must contain `preferred_id`,
                `represented_as` and `input_label` or `label_in_input`.

        Returns:
            Dict of virtual leaves, keyed `<preferred_id>.<key>`.
        """
        preferred_id = value["preferred_id"]
        input_label = value.get("input_label") or value["label_in_input"]
        represented_as = value["represented_as"]

        # adjust lengths: single strings are repeated up to the longest list
        max_l = max(
            len(_misc.to_list(attribute))
            for attribute in (preferred_id, input_label, represented_as)
        )

        return self._virtual_leaves(
            key,
            value,
            "preferred_id",
            self._broadcast(preferred_id, max_l),
            # a single label must be repeated as well; zipping the bare
            # string would yield one character per leaf
            self._broadcast(input_label, max_l),
            self._broadcast(represented_as, max_l),
        )

    def _horizontal_inheritance_source(self, key, value):
        """
        Create virtual leaves for multiple sources.

        `input_label`/`label_in_input` and `represented_as` may be given as
        lists (one element per source) or as a single string, which is then
        used for every leaf.

        Args:
            key: Name of the schema entry.
            value: The schema entry; must contain `source`,
                `represented_as` and `input_label` or `label_in_input`.

        Returns:
            Dict of virtual leaves, keyed `<source>.<key>`.
        """
        source = value["source"]
        input_label = value.get("input_label") or value["label_in_input"]
        represented_as = value["represented_as"]

        # adjust lengths: single strings are repeated once per source
        src_l = len(source)

        return self._virtual_leaves(
            key,
            value,
            "source",
            source,
            self._broadcast(input_label, src_l),
            self._broadcast(represented_as, src_l),
        )