2025-04-16 22:12:19 +02:00

357 lines
10 KiB
Python

#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'create' module. Handles the creation of BioCypher node and edge
dataclasses.
"""
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from typing import Union
from dataclasses import field, dataclass
import os
# Names exported by `from <module> import *`: the three handoff dataclasses
# defined in this module.
__all__ = [
"BioCypherEdge",
"BioCypherNode",
"BioCypherRelAsNode",
]
@dataclass(frozen=True)
class BioCypherNode:
    """
    Handoff class to represent biomedical entities as Neo4j nodes.

    Has id, label, property dict; id and label (in the Neo4j sense of a
    label, ie, the entity descriptor after the colon, such as
    ":Protein") are non-optional and called node_id and node_label to
    avoid confusion with "label" properties. Node labels are written in
    PascalCase and as nouns, as per Neo4j consensus.

    The dataclass is frozen, so the attributes cannot be rebound; the
    ``properties`` dict itself is, however, modified in place during
    initialisation (see ``__post_init__``).

    Args:
        node_id (string): consensus "best" id for biological entity

        node_label (string): primary type of entity, capitalised

        preferred_id (string): name of the preferred identifier;
            defaults to "id"

        properties (dict): collection of all other properties to be
            passed to neo4j for the respective node

    Todo:
        - check and correct small inconsistencies such as capitalisation
            of ID names ("uniprot" vs "UniProt")
        - check for correct ID patterns (eg "ENSG" + string of numbers,
            uniprot length)
        - ID conversion using pypath translation facilities for now
    """

    node_id: str
    node_label: str
    preferred_id: str = "id"
    properties: dict = field(default_factory=dict)

    def __post_init__(self):
        """
        Add id field to properties.

        Check for reserved keywords.

        Replace unwanted characters in properties.

        The ``properties`` dict handed to the constructor is updated in
        place: "id" and "preferred_id" are written, a ":TYPE" key is
        dropped, and line breaks in string values (and in the elements of
        list values) are replaced by spaces.
        """
        self.properties["id"] = self.node_id
        self.properties["preferred_id"] = self.preferred_id or None
        # TODO actually make None possible here; as is, "id" is the default in
        # the dataclass as well as in the configuration file

        if ":TYPE" in self.properties:
            logger.warning(
                "Keyword ':TYPE' is reserved for Neo4j. "
                "Removing from properties.",
            )
            del self.properties[":TYPE"]

        # Only values of existing keys are reassigned below, so the dict does
        # not change size while it is being iterated.
        for key, value in self.properties.items():
            if isinstance(value, str):
                self.properties[key] = self._strip_line_breaks(value)
            elif isinstance(value, list):
                self.properties[key] = [
                    self._clean_list_item(item) for item in value
                ]

    @staticmethod
    def _strip_line_breaks(text: str) -> str:
        """Return ``text`` with line separators replaced by single spaces."""
        # os.linesep is handled first so that a platform-specific multi-char
        # separator collapses to one space instead of one per character.
        return (
            text.replace(os.linesep, " ")
            .replace("\n", " ")
            .replace("\r", " ")
        )

    @classmethod
    def _clean_list_item(cls, item):
        """
        Sanitise a single element of a list-valued property.

        Numbers are converted to strings (the data may contain integers in
        lists), strings get their line breaks replaced, and any other kind
        of element (eg ``None``) is returned unchanged instead of raising
        ``AttributeError``.
        """
        if isinstance(item, (int, float)):
            item = str(item)
        if isinstance(item, str):
            return cls._strip_line_breaks(item)
        return item

    def get_id(self) -> str:
        """
        Returns primary node identifier.

        Returns:
            str: node_id
        """
        return self.node_id

    def get_label(self) -> str:
        """
        Returns primary node label.

        Returns:
            str: node_label
        """
        return self.node_label

    def get_type(self) -> str:
        """
        Returns primary node label (same value as ``get_label``).

        Returns:
            str: node_label
        """
        return self.node_label

    def get_preferred_id(self) -> str:
        """
        Returns preferred id.

        Returns:
            str: preferred_id
        """
        return self.preferred_id

    def get_properties(self) -> dict:
        """
        Returns the node property dict. After initialisation this also
        contains the "id" and "preferred_id" entries.

        Returns:
            dict: properties
        """
        return self.properties

    def get_dict(self) -> dict:
        """
        Return dict of id, labels, and properties.

        Returns:
            dict: node_id and node_label as top-level key-value pairs,
            properties as second-level dict.
        """
        return {
            "node_id": self.node_id,
            "node_label": self.node_label,
            "properties": self.properties,
        }
@dataclass(frozen=True)
class BioCypherEdge:
    """
    Handoff class to represent biomedical relationships in Neo4j.

    Has source and target ids, label, property dict; ids and label (in
    the Neo4j sense of a label, ie, the entity descriptor after the
    colon, such as ":TARGETS") are non-optional and called source_id,
    target_id, and relationship_label to avoid confusion with properties
    called "label", which usually denotes the human-readable form.
    Relationship labels are written in UPPERCASE and as verbs, as per
    Neo4j consensus.

    Args:
        source_id (string): consensus "best" id for biological entity

        target_id (string): consensus "best" id for biological entity

        relationship_label (string): type of interaction, UPPERCASE

        relationship_id (string, optional): identifier of the
            relationship itself; defaults to None

        properties (dict): collection of all other properties of the
            respective edge
    """

    source_id: str
    target_id: str
    relationship_label: str
    relationship_id: Union[str, None] = None
    properties: dict = field(default_factory=dict)

    def __post_init__(self):
        """
        Check for reserved keywords.

        Every reserved key found in ``properties`` (":TYPE", "id", "_ID")
        is removed from the dict in place and a debug message is logged
        for each of them.
        """
        reserved_keys = (
            (":TYPE", "Neo4j"),
            ("id", "Neo4j"),
            ("_ID", "Postgres"),
        )
        # Each key is checked on its own: a property dict may carry more
        # than one reserved key, and all of them have to be dropped.
        for key, system in reserved_keys:
            if key in self.properties:
                logger.debug(
                    f"Keyword '{key}' is reserved for {system}. "
                    "Removing from properties.",
                )
                del self.properties[key]

    def get_id(self) -> Union[str, None]:
        """
        Returns the relationship identifier or None if none was given.

        Returns:
            str: relationship_id
        """
        return self.relationship_id

    def get_source_id(self) -> str:
        """
        Returns primary node identifier of relationship source.

        Returns:
            str: source_id
        """
        return self.source_id

    def get_target_id(self) -> str:
        """
        Returns primary node identifier of relationship target.

        Returns:
            str: target_id
        """
        return self.target_id

    def get_label(self) -> str:
        """
        Returns relationship label.

        Returns:
            str: relationship_label
        """
        return self.relationship_label

    def get_type(self) -> str:
        """
        Returns relationship label (same value as ``get_label``).

        Returns:
            str: relationship_label
        """
        return self.relationship_label

    def get_properties(self) -> dict:
        """
        Returns all other relationship properties apart from primary ids
        and label as key-value pairs.

        Returns:
            dict: properties
        """
        return self.properties

    def get_dict(self) -> dict:
        """
        Return dict of ids, label, and properties.

        Returns:
            dict: relationship_id, source_id, target_id and
                relationship_label as top-level key-value pairs,
                properties as second-level dict. A falsy
                relationship_id is reported as None.
        """
        return {
            "relationship_id": self.relationship_id or None,
            "source_id": self.source_id,
            "target_id": self.target_id,
            "relationship_label": self.relationship_label,
            "properties": self.properties,
        }
@dataclass(frozen=True)
class BioCypherRelAsNode:
    """
    Relationship modelled as a node: a triplet made of one BioCypherNode
    and two BioCypherEdges (one towards the source, one towards the
    target of the relationship). Mainly useful for type checking, ie,
    when a receiving function has to tell a plain edge apart from such a
    triplet.

    Args:
        node (BioCypherNode): node representing the relationship

        source_edge (BioCypherEdge): edge representing the source of the
            relationship

        target_edge (BioCypherEdge): edge representing the target of the
            relationship

    Raises:
        TypeError: if any of the three members is not an instance of the
            expected class.
    """

    node: BioCypherNode
    source_edge: BioCypherEdge
    target_edge: BioCypherEdge

    def __post_init__(self):
        """
        Validate the member types, in the order node, source_edge,
        target_edge; the first mismatch raises TypeError.
        """
        checks = (
            (
                "node",
                BioCypherNode,
                "BioCypherRelAsNode.node must be a BioCypherNode, ",
            ),
            (
                "source_edge",
                BioCypherEdge,
                "BioCypherRelAsNode.source_edge must be a BioCypherEdge, ",
            ),
            (
                "target_edge",
                BioCypherEdge,
                "BioCypherRelAsNode.target_edge must be a BioCypherEdge, ",
            ),
        )
        for attr_name, required_type, message_head in checks:
            member = getattr(self, attr_name)
            if isinstance(member, required_type):
                continue
            raise TypeError(f"{message_head}not {type(member)}.")

    def get_node(self) -> BioCypherNode:
        """Return the node representing the relationship."""
        return self.node

    def get_source_edge(self) -> BioCypherEdge:
        """Return the edge representing the source of the relationship."""
        return self.source_edge

    def get_target_edge(self) -> BioCypherEdge:
        """Return the edge representing the target of the relationship."""
        return self.target_edge