308 lines
9.1 KiB
Python
308 lines
9.1 KiB
Python
#!/usr/bin/env python
|
|
|
|
#
|
|
# Copyright 2021, Heidelberg University Clinic
|
|
#
|
|
# File author(s): Sebastian Lobentanzer
|
|
# ...
|
|
#
|
|
# Distributed under MIT licence, see the file `LICENSE`.
|
|
#
|
|
"""
|
|
BioCypher 'mapping' module. Handles the mapping of user-defined schema to the
|
|
underlying ontology.
|
|
"""
|
|
from ._logger import logger
|
|
|
|
logger.debug(f"Loading module {__name__}.")
|
|
|
|
from typing import Optional
|
|
from urllib.request import urlopen
|
|
|
|
import yaml
|
|
|
|
from . import _misc
|
|
from ._config import config as _config
|
|
|
|
|
|
class OntologyMapping:
|
|
"""
|
|
Class to store the ontology mapping and extensions.
|
|
"""
|
|
|
|
def __init__(self, config_file: str = None):
|
|
self.schema = self._read_config(config_file)
|
|
|
|
self.extended_schema = self._extend_schema()
|
|
|
|
def _read_config(self, config_file: str = None):
|
|
"""
|
|
Read the configuration file and store the ontology mapping and extensions.
|
|
"""
|
|
if config_file is None:
|
|
schema_config = {}
|
|
|
|
# load yaml file from web
|
|
elif config_file.startswith("http"):
|
|
with urlopen(config_file) as f:
|
|
schema_config = yaml.safe_load(f)
|
|
|
|
# get graph state from config (assume file is local)
|
|
else:
|
|
with open(config_file, "r") as f:
|
|
schema_config = yaml.safe_load(f)
|
|
|
|
return schema_config
|
|
|
|
def _extend_schema(self, d: Optional[dict] = None) -> dict:
|
|
"""
|
|
Get leaves of the tree hierarchy from the data structure dict
|
|
contained in the `schema_config.yaml`. Creates virtual leaves
|
|
(as children) from entries that provide more than one preferred
|
|
id type (and corresponding inputs).
|
|
|
|
Args:
|
|
d:
|
|
Data structure dict from yaml file.
|
|
|
|
"""
|
|
|
|
d = d or self.schema
|
|
|
|
extended_schema = dict()
|
|
|
|
# first pass: get parent leaves with direct representation in ontology
|
|
for k, v in d.items():
|
|
# k is not an entity
|
|
if "represented_as" not in v:
|
|
continue
|
|
|
|
# preferred_id optional: if not provided, use `id`
|
|
if not v.get("preferred_id"):
|
|
v["preferred_id"] = "id"
|
|
|
|
# k is an entity that is present in the ontology
|
|
if "is_a" not in v:
|
|
extended_schema[k] = v
|
|
|
|
# second pass: "vertical" inheritance
|
|
d = self._vertical_property_inheritance(d)
|
|
for k, v in d.items():
|
|
if "is_a" in v:
|
|
# prevent loops
|
|
if k == v["is_a"]:
|
|
logger.warning(
|
|
f"Loop detected in ontology mapping: {k} -> {v}. "
|
|
"Removing item. Please fix the inheritance if you want "
|
|
"to use this item."
|
|
)
|
|
continue
|
|
|
|
extended_schema[k] = v
|
|
|
|
# "horizontal" inheritance: create siblings for multiple identifiers or
|
|
# sources -> virtual leaves or implicit children
|
|
mi_leaves = {}
|
|
ms_leaves = {}
|
|
for k, v in d.items():
|
|
# k is not an entity
|
|
if "represented_as" not in v:
|
|
continue
|
|
|
|
if isinstance(v.get("preferred_id"), list):
|
|
mi_leaves = self._horizontal_inheritance_pid(k, v)
|
|
extended_schema.update(mi_leaves)
|
|
|
|
elif isinstance(v.get("source"), list):
|
|
ms_leaves = self._horizontal_inheritance_source(k, v)
|
|
extended_schema.update(ms_leaves)
|
|
|
|
return extended_schema
|
|
|
|
def _vertical_property_inheritance(self, d):
|
|
"""
|
|
Inherit properties from parents to children and update `d` accordingly.
|
|
"""
|
|
for k, v in d.items():
|
|
# k is not an entity
|
|
if "represented_as" not in v:
|
|
continue
|
|
|
|
# k is an entity that is present in the ontology
|
|
if "is_a" not in v:
|
|
continue
|
|
|
|
# "vertical" inheritance: inherit properties from parent
|
|
if v.get("inherit_properties", False):
|
|
# get direct ancestor
|
|
if isinstance(v["is_a"], list):
|
|
parent = v["is_a"][0]
|
|
else:
|
|
parent = v["is_a"]
|
|
|
|
# ensure child has properties and exclude_properties
|
|
if "properties" not in v:
|
|
v["properties"] = {}
|
|
if "exclude_properties" not in v:
|
|
v["exclude_properties"] = {}
|
|
|
|
# update properties of child
|
|
parent_props = self.schema[parent].get("properties", {})
|
|
if parent_props:
|
|
v["properties"].update(parent_props)
|
|
|
|
parent_excl_props = self.schema[parent].get(
|
|
"exclude_properties", {}
|
|
)
|
|
if parent_excl_props:
|
|
v["exclude_properties"].update(parent_excl_props)
|
|
|
|
# update schema (d)
|
|
d[k] = v
|
|
|
|
return d
|
|
|
|
def _horizontal_inheritance_pid(self, key, value):
|
|
"""
|
|
Create virtual leaves for multiple preferred id types or sources.
|
|
|
|
If we create virtual leaves, input_label/label_in_input always has to be
|
|
a list.
|
|
"""
|
|
|
|
leaves = {}
|
|
|
|
preferred_id = value["preferred_id"]
|
|
input_label = value.get("input_label") or value["label_in_input"]
|
|
represented_as = value["represented_as"]
|
|
|
|
# adjust lengths
|
|
max_l = max(
|
|
[
|
|
len(_misc.to_list(preferred_id)),
|
|
len(_misc.to_list(input_label)),
|
|
len(_misc.to_list(represented_as)),
|
|
],
|
|
)
|
|
|
|
# adjust pid length if necessary
|
|
if isinstance(preferred_id, str):
|
|
pids = [preferred_id] * max_l
|
|
else:
|
|
pids = preferred_id
|
|
|
|
# adjust rep length if necessary
|
|
if isinstance(represented_as, str):
|
|
reps = [represented_as] * max_l
|
|
else:
|
|
reps = represented_as
|
|
|
|
for pid, lab, rep in zip(pids, input_label, reps):
|
|
skey = pid + "." + key
|
|
svalue = {
|
|
"preferred_id": pid,
|
|
"input_label": lab,
|
|
"represented_as": rep,
|
|
# mark as virtual
|
|
"virtual": True,
|
|
}
|
|
|
|
# inherit is_a if exists
|
|
if "is_a" in value.keys():
|
|
# treat as multiple inheritance
|
|
if isinstance(value["is_a"], list):
|
|
v = list(value["is_a"])
|
|
v.insert(0, key)
|
|
svalue["is_a"] = v
|
|
|
|
else:
|
|
svalue["is_a"] = [key, value["is_a"]]
|
|
|
|
else:
|
|
# set parent as is_a
|
|
svalue["is_a"] = key
|
|
|
|
# inherit everything except core attributes
|
|
for k, v in value.items():
|
|
if k not in [
|
|
"is_a",
|
|
"preferred_id",
|
|
"input_label",
|
|
"label_in_input",
|
|
"represented_as",
|
|
]:
|
|
svalue[k] = v
|
|
|
|
leaves[skey] = svalue
|
|
|
|
return leaves
|
|
|
|
def _horizontal_inheritance_source(self, key, value):
|
|
"""
|
|
Create virtual leaves for multiple sources.
|
|
|
|
If we create virtual leaves, input_label/label_in_input always has to be
|
|
a list.
|
|
"""
|
|
|
|
leaves = {}
|
|
|
|
source = value["source"]
|
|
input_label = value.get("input_label") or value["label_in_input"]
|
|
represented_as = value["represented_as"]
|
|
|
|
# adjust lengths
|
|
src_l = len(source)
|
|
|
|
# adjust label length if necessary
|
|
if isinstance(input_label, str):
|
|
labels = [input_label] * src_l
|
|
else:
|
|
labels = input_label
|
|
|
|
# adjust rep length if necessary
|
|
if isinstance(represented_as, str):
|
|
reps = [represented_as] * src_l
|
|
else:
|
|
reps = represented_as
|
|
|
|
for src, lab, rep in zip(source, labels, reps):
|
|
skey = src + "." + key
|
|
svalue = {
|
|
"source": src,
|
|
"input_label": lab,
|
|
"represented_as": rep,
|
|
# mark as virtual
|
|
"virtual": True,
|
|
}
|
|
|
|
# inherit is_a if exists
|
|
if "is_a" in value.keys():
|
|
# treat as multiple inheritance
|
|
if isinstance(value["is_a"], list):
|
|
v = list(value["is_a"])
|
|
v.insert(0, key)
|
|
svalue["is_a"] = v
|
|
|
|
else:
|
|
svalue["is_a"] = [key, value["is_a"]]
|
|
|
|
else:
|
|
# set parent as is_a
|
|
svalue["is_a"] = key
|
|
|
|
# inherit everything except core attributes
|
|
for k, v in value.items():
|
|
if k not in [
|
|
"is_a",
|
|
"source",
|
|
"input_label",
|
|
"label_in_input",
|
|
"represented_as",
|
|
]:
|
|
svalue[k] = v
|
|
|
|
leaves[skey] = svalue
|
|
|
|
return leaves
|