release commit
This commit is contained in:
307
biocypher/_mapping.py
Normal file
307
biocypher/_mapping.py
Normal file
@ -0,0 +1,307 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'mapping' module. Handles the mapping of user-defined schema to the
|
||||
underlying ontology.
|
||||
"""
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import Optional
|
||||
from urllib.request import urlopen
|
||||
|
||||
import yaml
|
||||
|
||||
from . import _misc
|
||||
from ._config import config as _config
|
||||
|
||||
|
||||
class OntologyMapping:
    """
    Class to store the ontology mapping and extensions.

    On construction, the schema configuration is read into ``schema`` and
    the extended schema (including virtual leaves) is computed from it and
    stored in ``extended_schema``.
    """

    # attributes that are set individually on each virtual leaf and are thus
    # never copied verbatim from the parent entry (the field the leaves are
    # split on, `preferred_id` or `source`, is excluded in addition)
    _LEAF_CORE_KEYS = (
        "is_a",
        "input_label",
        "label_in_input",
        "represented_as",
    )

    def __init__(self, config_file: Optional[str] = None):
        """
        Read the schema configuration and derive the extended schema.

        Args:
            config_file:
                Path or URL of the YAML schema configuration. If ``None``,
                an empty schema is used.
        """
        self.schema = self._read_config(config_file)

        self.extended_schema = self._extend_schema()

    def _read_config(self, config_file: Optional[str] = None) -> dict:
        """
        Read the configuration file and return the ontology mapping and
        extensions contained in it.

        Args:
            config_file:
                Path or URL (anything starting with ``http``) of the YAML
                file. If ``None``, an empty dict is returned.

        Returns:
            The parsed YAML content; an empty dict if no file was given or
            the file contains an empty document.
        """
        if config_file is None:
            return {}

        # load yaml file from web
        if config_file.startswith("http"):
            with urlopen(config_file) as f:
                schema_config = yaml.safe_load(f)

        # get graph state from config (assume file is local)
        else:
            # explicit encoding so parsing does not depend on the locale
            with open(config_file, "r", encoding="utf-8") as f:
                schema_config = yaml.safe_load(f)

        # an empty YAML document is parsed to `None`; normalise to a dict so
        # that `_extend_schema` can iterate over it
        return schema_config or {}

    @staticmethod
    def _is_entity(v) -> bool:
        """
        Return whether the schema value `v` describes an entity, i.e., is a
        mapping that contains `represented_as`. Non-mapping values (e.g.
        plain strings or `None`) are never entities.
        """
        return isinstance(v, dict) and "represented_as" in v

    def _extend_schema(self, d: Optional[dict] = None) -> dict:
        """
        Get leaves of the tree hierarchy from the data structure dict
        contained in the `schema_config.yaml`. Creates virtual leaves
        (as children) from entries that provide more than one preferred
        id type (and corresponding inputs).

        Entities without `preferred_id` get `preferred_id = "id"` written
        into their entry in `d` (the input is modified in place).

        Args:
            d:
                Data structure dict from yaml file. If not given (or
                empty), `self.schema` is used.

        Returns:
            Dict of the extended schema: the entries of `d` that are kept
            plus the generated virtual leaves.
        """

        d = d or self.schema

        extended_schema = {}

        # first pass: get parent leaves with direct representation in ontology
        for k, v in d.items():
            # k is not an entity
            if not self._is_entity(v):
                continue

            # preferred_id optional: if not provided, use `id`
            if not v.get("preferred_id"):
                v["preferred_id"] = "id"

            # k is an entity that is present in the ontology
            if "is_a" not in v:
                extended_schema[k] = v

        # second pass: "vertical" inheritance
        d = self._vertical_property_inheritance(d)
        for k, v in d.items():
            if not isinstance(v, dict) or "is_a" not in v:
                continue

            # prevent loops
            if k == v["is_a"]:
                logger.warning(
                    f"Loop detected in ontology mapping: {k} -> {v}. "
                    "Removing item. Please fix the inheritance if you want "
                    "to use this item."
                )
                continue

            extended_schema[k] = v

        # "horizontal" inheritance: create siblings for multiple identifiers or
        # sources -> virtual leaves or implicit children
        for k, v in d.items():
            # k is not an entity
            if not self._is_entity(v):
                continue

            # multiple preferred ids take precedence over multiple sources
            if isinstance(v.get("preferred_id"), list):
                extended_schema.update(self._horizontal_inheritance_pid(k, v))

            elif isinstance(v.get("source"), list):
                extended_schema.update(
                    self._horizontal_inheritance_source(k, v)
                )

        return extended_schema

    def _vertical_property_inheritance(self, d):
        """
        Inherit properties from parents to children and update `d` accordingly.

        Only entities that have `is_a` and a truthy `inherit_properties` are
        touched. They receive the `properties` and `exclude_properties` of
        their direct parent (the first element if `is_a` is a list); parent
        values overwrite child values of the same name. A parent that is
        found neither in `self.schema` nor in `d` contributes nothing.

        Args:
            d:
                Data structure dict from yaml file; modified in place.

        Returns:
            The same dict `d`.
        """
        for v in d.values():
            # not an entity
            if not self._is_entity(v):
                continue

            # entity that is present in the ontology: nothing to inherit
            if "is_a" not in v:
                continue

            # "vertical" inheritance: inherit properties from parent
            if not v.get("inherit_properties", False):
                continue

            # get direct ancestor
            is_a = v["is_a"]
            parent = is_a[0] if isinstance(is_a, list) else is_a

            # ensure child has properties and exclude_properties
            # NOTE(review): `exclude_properties` is treated as a mapping
            # here; if schemas provide it as a list, `update` below will
            # not work as intended - confirm against the schema format.
            v.setdefault("properties", {})
            v.setdefault("exclude_properties", {})

            # look up the parent; it may be an ontology class that has no
            # entry in the schema configuration at all
            parent_entry = self.schema.get(parent)
            if parent_entry is None:
                parent_entry = d.get(parent)
            if not isinstance(parent_entry, dict):
                continue

            # update properties of child
            parent_props = parent_entry.get("properties", {})
            if parent_props:
                v["properties"].update(parent_props)

            parent_excl_props = parent_entry.get("exclude_properties", {})
            if parent_excl_props:
                v["exclude_properties"].update(parent_excl_props)

        return d

    @staticmethod
    def _broadcast(item, length):
        """
        Return `item` repeated `length` times if it is a single string,
        otherwise return `item` unchanged.
        """
        if isinstance(item, str):
            return [item] * length

        return item

    def _make_virtual_leaves(
        self, key, value, field, field_values, labels, reps
    ):
        """
        Build the virtual leaves of the entry `key` by walking through
        `field_values`, `labels` and `reps` in parallel (stopping at the
        shortest of the three).

        Args:
            key:
                Name of the parent entry.
            value:
                Schema dict of the parent entry.
            field:
                Attribute the leaves are split on (`preferred_id` or
                `source`).
            field_values:
                One value of `field` per leaf.
            labels:
                One input label per leaf.
            reps:
                One `represented_as` value per leaf.

        Returns:
            Dict of `<field value>.<key>` to the schema dict of the leaf.
        """
        skip = {field, *self._LEAF_CORE_KEYS}

        # inherit everything except core attributes; values are passed on
        # as they are (not copied), so all leaves share them with the parent
        inherited = {k: v for k, v in value.items() if k not in skip}

        leaves = {}

        for val, lab, rep in zip(field_values, labels, reps):
            leaf = {
                field: val,
                "input_label": lab,
                "represented_as": rep,
                # mark as virtual
                "virtual": True,
            }

            if "is_a" not in value:
                # set parent as is_a
                leaf["is_a"] = key
            elif isinstance(value["is_a"], list):
                # treat as multiple inheritance
                leaf["is_a"] = [key, *value["is_a"]]
            else:
                leaf["is_a"] = [key, value["is_a"]]

            leaf.update(inherited)

            leaves[f"{val}.{key}"] = leaf

        return leaves

    def _horizontal_inheritance_pid(self, key, value):
        """
        Create virtual leaves for multiple preferred id types.

        `preferred_id`, the input label (`input_label`, or `label_in_input`
        as fallback) and `represented_as` may each be given as a single
        string, in which case the string is used for every leaf, or as a
        list with one element per leaf.

        Args:
            key:
                Name of the parent entry.
            value:
                Schema dict of the parent entry.

        Returns:
            Dict of `<preferred id>.<key>` to the schema dict of the leaf.
        """

        preferred_id = value["preferred_id"]
        input_label = value.get("input_label") or value["label_in_input"]
        represented_as = value["represented_as"]

        # adjust lengths
        max_l = max(
            len(_misc.to_list(preferred_id)),
            len(_misc.to_list(input_label)),
            len(_misc.to_list(represented_as)),
        )

        # a single string label must be repeated as well; otherwise `zip`
        # would walk through the label character by character
        return self._make_virtual_leaves(
            key,
            value,
            "preferred_id",
            self._broadcast(preferred_id, max_l),
            self._broadcast(input_label, max_l),
            self._broadcast(represented_as, max_l),
        )

    def _horizontal_inheritance_source(self, key, value):
        """
        Create virtual leaves for multiple sources.

        The input label (`input_label`, or `label_in_input` as fallback)
        and `represented_as` may each be given as a single string, in which
        case the string is used for every leaf, or as a list with one
        element per source.

        Args:
            key:
                Name of the parent entry.
            value:
                Schema dict of the parent entry.

        Returns:
            Dict of `<source>.<key>` to the schema dict of the leaf.
        """

        source = value["source"]
        input_label = value.get("input_label") or value["label_in_input"]
        represented_as = value["represented_as"]

        # adjust lengths
        src_l = len(source)

        return self._make_virtual_leaves(
            key,
            value,
            "source",
            source,
            self._broadcast(input_label, src_l),
            self._broadcast(represented_as, src_l),
        )
|
Reference in New Issue
Block a user