357 lines
10 KiB
Python
357 lines
10 KiB
Python
#!/usr/bin/env python
|
|
#
|
|
# Copyright 2021, Heidelberg University Clinic
|
|
#
|
|
# File author(s): Sebastian Lobentanzer
|
|
# ...
|
|
#
|
|
# Distributed under MIT licence, see the file `LICENSE`.
|
|
#
|
|
"""
|
|
BioCypher 'create' module. Handles the creation of BioCypher node and edge
|
|
dataclasses.
|
|
"""
|
|
from ._logger import logger
|
|
|
|
logger.debug(f"Loading module {__name__}.")
|
|
|
|
from typing import Union
|
|
from dataclasses import field, dataclass
|
|
import os
|
|
|
|
__all__ = [
|
|
"BioCypherEdge",
|
|
"BioCypherNode",
|
|
"BioCypherRelAsNode",
|
|
]
|
|
|
|
|
|
@dataclass(frozen=True)
class BioCypherNode:
    """
    Handoff class to represent biomedical entities as Neo4j nodes.

    Has id, label, property dict; id and label (in the Neo4j sense of a
    label, ie, the entity descriptor after the colon, such as
    ":Protein") are non-optional and called node_id and node_label to
    avoid confusion with "label" properties. Node labels are written in
    PascalCase and as nouns, as per Neo4j consensus.

    Args:
        node_id (string): consensus "best" id for biological entity
        node_label (string): primary type of entity, capitalised
        preferred_id (string): value stored under the "preferred_id"
            property key; defaults to "id"
        properties (dict): collection of all other properties to be
            passed to neo4j for the respective node. The dict is
            updated in place during initialisation.

    Todo:
        - check and correct small inconsistencies such as capitalisation
            of ID names ("uniprot" vs "UniProt")
        - check for correct ID patterns (eg "ENSG" + string of numbers,
            uniprot length)
        - ID conversion using pypath translation facilities for now
    """

    node_id: str
    node_label: str
    preferred_id: str = "id"
    properties: dict = field(default_factory=dict)

    def __post_init__(self):
        """
        Add id field to properties.

        Check for reserved keywords.

        Replace unwanted characters in properties.
        """
        # The dataclass is frozen, but the properties dict itself is
        # mutable, so it is modified in place rather than reassigned.
        self.properties["id"] = self.node_id
        self.properties["preferred_id"] = self.preferred_id or None
        # TODO actually make None possible here; as is, "id" is the default in
        # the dataclass as well as in the configuration file

        if ":TYPE" in self.properties:
            logger.warning(
                "Keyword ':TYPE' is reserved for Neo4j. "
                "Removing from properties.",
            )
            del self.properties[":TYPE"]

        # Only values of existing keys are reassigned, so the dict does not
        # change size while it is being iterated.
        for k, v in self.properties.items():
            if isinstance(v, str):
                self.properties[k] = self._strip_linebreaks(v)
            elif isinstance(v, list):
                self.properties[k] = [self._clean_list_item(val) for val in v]

    @staticmethod
    def _strip_linebreaks(text: str) -> str:
        """
        Replace line break characters in ``text`` with single spaces.

        The platform line separator is replaced first, then any remaining
        "\\n" and "\\r" characters.
        """
        return (
            text.replace(os.linesep, " ")
            .replace("\n", " ")
            .replace("\r", " ")
        )

    @staticmethod
    def _clean_list_item(val):
        """
        Sanitise a single element of a list-valued property.

        Numbers are converted to strings (input data may contain integers
        in lists), strings get their line breaks replaced, and any other
        value (eg None) is returned unchanged instead of raising an
        AttributeError.
        """
        if isinstance(val, (int, float)):
            val = str(val)
        if isinstance(val, str):
            return BioCypherNode._strip_linebreaks(val)
        return val

    def get_id(self) -> str:
        """
        Returns primary node identifier.

        Returns:
            str: node_id
        """
        return self.node_id

    def get_label(self) -> str:
        """
        Returns primary node label.

        Returns:
            str: node_label
        """
        return self.node_label

    def get_type(self) -> str:
        """
        Returns primary node label (same value as ``get_label``).

        Returns:
            str: node_label
        """
        return self.node_label

    def get_preferred_id(self) -> str:
        """
        Returns preferred id.

        Returns:
            str: preferred_id
        """
        return self.preferred_id

    def get_properties(self) -> dict:
        """
        Returns the node property dict. After initialisation it also
        contains the "id" and "preferred_id" entries.

        Returns:
            dict: properties
        """
        return self.properties

    def get_dict(self) -> dict:
        """
        Return dict of id, labels, and properties.

        Returns:
            dict: node_id and node_label as top-level key-value pairs,
            properties as second-level dict.
        """
        return {
            "node_id": self.node_id,
            "node_label": self.node_label,
            "properties": self.properties,
        }
|
|
|
|
|
|
@dataclass(frozen=True)
class BioCypherEdge:
    """
    Handoff class to represent biomedical relationships in Neo4j.

    Has source and target ids, label, property dict; ids and label (in
    the Neo4j sense of a label, ie, the entity descriptor after the
    colon, such as ":TARGETS") are non-optional and called source_id,
    target_id, and relationship_label to avoid confusion with properties
    called "label", which usually denotes the human-readable form.
    Relationship labels are written in UPPERCASE and as verbs, as per
    Neo4j consensus.

    Args:

        source_id (string): consensus "best" id for biological entity

        target_id (string): consensus "best" id for biological entity

        relationship_label (string): type of interaction, UPPERCASE

        relationship_id (string): optional identifier of the
            relationship itself; defaults to None

        properties (dict): collection of all other properties of the
            respective edge. The dict is updated in place during
            initialisation (reserved keys are removed).

    """

    source_id: str
    target_id: str
    relationship_label: str
    relationship_id: Union[str, None] = None
    properties: dict = field(default_factory=dict)

    def __post_init__(self):
        """
        Check for reserved keywords.

        Every reserved key present in the properties is removed; the
        checks are independent of each other, so a property dict that
        contains several reserved keys is cleaned of all of them.
        """
        # (reserved key, system that reserves it)
        reserved_keywords = (
            (":TYPE", "Neo4j"),
            ("id", "Neo4j"),
            ("_ID", "Postgres"),
        )

        for keyword, system in reserved_keywords:
            if keyword in self.properties:
                logger.debug(
                    f"Keyword '{keyword}' is reserved for {system}. "
                    "Removing from properties.",
                )
                del self.properties[keyword]

    def get_id(self) -> Union[str, None]:
        """
        Returns the relationship identifier or None if none was given.

        Returns:
            str: relationship_id
        """
        return self.relationship_id

    def get_source_id(self) -> str:
        """
        Returns primary node identifier of relationship source.

        Returns:
            str: source_id
        """
        return self.source_id

    def get_target_id(self) -> str:
        """
        Returns primary node identifier of relationship target.

        Returns:
            str: target_id
        """
        return self.target_id

    def get_label(self) -> str:
        """
        Returns relationship label.

        Returns:
            str: relationship_label
        """
        return self.relationship_label

    def get_type(self) -> str:
        """
        Returns relationship label (same value as ``get_label``).

        Returns:
            str: relationship_label
        """
        return self.relationship_label

    def get_properties(self) -> dict:
        """
        Returns all other relationship properties apart from primary ids
        and label as key-value pairs.

        Returns:
            dict: properties
        """
        return self.properties

    def get_dict(self) -> dict:
        """
        Return dict of ids, label, and properties.

        Returns:
            dict: relationship_id, source_id, target_id and
                relationship_label as top-level key-value pairs,
                properties as second-level dict. A falsy
                relationship_id (eg empty string) is reported as None.
        """
        return {
            "relationship_id": self.relationship_id or None,
            "source_id": self.source_id,
            "target_id": self.target_id,
            "relationship_label": self.relationship_label,
            "properties": self.properties,
        }
|
|
|
|
|
|
@dataclass(frozen=True)
class BioCypherRelAsNode:
    """
    Relationship modelled as a node: a triplet of one BioCypherNode and
    two BioCypherEdges (one connecting to the source, one to the
    target). Mainly useful for type checking, ie, when a receiving
    function has to tell apart a relationship given as a single edge
    from one given as a triplet.

    Args:

        node (BioCypherNode): node representing the relationship

        source_edge (BioCypherEdge): edge representing the source of the
            relationship

        target_edge (BioCypherEdge): edge representing the target of the
            relationship

    Raises:

        TypeError: if any of the three members is not an instance of
            its expected class.

    """

    node: BioCypherNode
    source_edge: BioCypherEdge
    target_edge: BioCypherEdge

    def __post_init__(self):
        """
        Validate the types of the three members, in declaration order.
        """
        # (attribute name, class the attribute must be an instance of)
        expected_members = (
            ("node", BioCypherNode),
            ("source_edge", BioCypherEdge),
            ("target_edge", BioCypherEdge),
        )

        for attr_name, expected_cls in expected_members:
            member = getattr(self, attr_name)
            if isinstance(member, expected_cls):
                continue
            raise TypeError(
                f"BioCypherRelAsNode.{attr_name} must be a "
                f"{expected_cls.__name__}, "
                f"not {type(member)}.",
            )

    def get_node(self) -> BioCypherNode:
        """Returns the node representing the relationship."""
        return self.node

    def get_source_edge(self) -> BioCypherEdge:
        """Returns the edge representing the source of the relationship."""
        return self.source_edge

    def get_target_edge(self) -> BioCypherEdge:
        """Returns the edge representing the target of the relationship."""
        return self.target_edge
|