release commit
This commit is contained in:
356
biocypher/_create.py
Normal file
356
biocypher/_create.py
Normal file
@ -0,0 +1,356 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'create' module. Handles the creation of BioCypher node and edge
|
||||
dataclasses.
|
||||
"""
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import Union
|
||||
from dataclasses import field, dataclass
|
||||
import os
|
||||
|
||||
__all__ = [
|
||||
"BioCypherEdge",
|
||||
"BioCypherNode",
|
||||
"BioCypherRelAsNode",
|
||||
]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class BioCypherNode:
    """
    Handoff class to represent biomedical entities as Neo4j nodes.

    Has id, label, property dict; id and label (in the Neo4j sense of a
    label, ie, the entity descriptor after the colon, such as
    ":Protein") are non-optional and called node_id and node_label to
    avoid confusion with "label" properties. Node labels are written in
    PascalCase and as nouns, as per Neo4j consensus.

    Args:
        node_id (string): consensus "best" id for biological entity
        node_label (string): primary type of entity, capitalised
        preferred_id (string): name of the preferred identifier; defaults
            to "id"
        properties (dict): collection of all other properties to be
            passed to neo4j for the respective node. The dict is
            modified in place by ``__post_init__`` (it is not copied).

    Todo:
        - check and correct small inconsistencies such as capitalisation
          of ID names ("uniprot" vs "UniProt")
        - check for correct ID patterns (eg "ENSG" + string of numbers,
          uniprot length)
        - ID conversion using pypath translation facilities for now
    """

    node_id: str
    node_label: str
    preferred_id: str = "id"
    properties: dict = field(default_factory=dict)

    def __post_init__(self):
        """
        Add id field to properties.

        Check for reserved keywords.

        Replace unwanted characters in properties.
        """
        # The dataclass is frozen, which only forbids rebinding attributes;
        # the properties dict itself can still be updated in place.
        self.properties["id"] = self.node_id
        self.properties["preferred_id"] = self.preferred_id or None
        # TODO actually make None possible here; as is, "id" is the default in
        # the dataclass as well as in the configuration file

        if ":TYPE" in self.properties:
            # The value is dropped, not renamed to another key.
            logger.warning(
                "Keyword ':TYPE' is reserved for Neo4j. "
                "Removing from properties.",
            )
            del self.properties[":TYPE"]

        # Only values of existing keys are reassigned below, so the dict
        # does not change size while it is being iterated.
        for key, value in self.properties.items():
            if isinstance(value, str):
                self.properties[key] = self._strip_newlines(value)
            elif isinstance(value, list):
                self.properties[key] = [
                    self._clean_list_item(item) for item in value
                ]

    @staticmethod
    def _strip_newlines(text: str) -> str:
        """Replace every line break in ``text`` with a single space."""
        return (
            text.replace(os.linesep, " ")
            .replace("\n", " ")
            .replace("\r", " ")
        )

    @classmethod
    def _clean_list_item(cls, item):
        """
        Sanitise one element of a list-valued property.

        Numbers are converted to strings (input data may contain integers
        in lists), strings get their line breaks replaced, and any other
        element (eg None or a nested container) is returned unchanged
        instead of raising AttributeError.
        """
        if isinstance(item, (int, float)):
            return str(item)
        if isinstance(item, str):
            return cls._strip_newlines(item)
        return item

    def get_id(self) -> str:
        """
        Returns primary node identifier.

        Returns:
            str: node_id
        """
        return self.node_id

    def get_label(self) -> str:
        """
        Returns primary node label.

        Returns:
            str: node_label
        """
        return self.node_label

    def get_type(self) -> str:
        """
        Returns primary node label (same value as ``get_label``).

        Returns:
            str: node_label
        """
        return self.node_label

    def get_preferred_id(self) -> str:
        """
        Returns preferred id.

        Returns:
            str: preferred_id
        """
        return self.preferred_id

    def get_properties(self) -> dict:
        """
        Returns the node properties as key-value pairs. The dict also
        contains the "id" and "preferred_id" entries added on creation.

        Returns:
            dict: properties
        """
        return self.properties

    def get_dict(self) -> dict:
        """
        Return dict of id, labels, and properties.

        Returns:
            dict: node_id and node_label as top-level key-value pairs,
            properties as second-level dict.
        """
        return {
            "node_id": self.node_id,
            "node_label": self.node_label,
            "properties": self.properties,
        }
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class BioCypherEdge:
    """
    Handoff class to represent biomedical relationships in Neo4j.

    Has source and target ids, label, property dict; ids and label (in
    the Neo4j sense of a label, ie, the entity descriptor after the
    colon, such as ":TARGETS") are non-optional and called source_id,
    target_id, and relationship_label to avoid confusion with properties
    called "label", which usually denotes the human-readable form.
    Relationship labels are written in UPPERCASE and as verbs, as per
    Neo4j consensus.

    Args:

        source_id (string): consensus "best" id for biological entity

        target_id (string): consensus "best" id for biological entity

        relationship_label (string): type of interaction, UPPERCASE

        relationship_id (string, optional): identifier of the
            relationship itself; defaults to None

        properties (dict): collection of all other properties of the
            respective edge. The dict is modified in place by
            ``__post_init__`` (it is not copied).

    """

    source_id: str
    target_id: str
    relationship_label: str
    relationship_id: Union[str, None] = None
    properties: dict = field(default_factory=dict)

    def __post_init__(self):
        """
        Check for reserved keywords and remove them from the properties.

        Every reserved key is checked independently, so a property dict
        that contains several of them has all of them removed.
        """
        reserved = (
            (":TYPE", "Neo4j"),
            ("id", "Neo4j"),
            ("_ID", "Postgres"),
        )

        for keyword, system in reserved:
            if keyword in self.properties:
                # The value is dropped, not renamed to another key.
                logger.debug(
                    f"Keyword '{keyword}' is reserved for {system}. "
                    "Removing from properties.",
                )
                del self.properties[keyword]

    def get_id(self) -> Union[str, None]:
        """
        Returns the relationship identifier or None.

        Returns:
            str: relationship_id, or None if no id was given
        """
        return self.relationship_id

    def get_source_id(self) -> str:
        """
        Returns primary node identifier of relationship source.

        Returns:
            str: source_id
        """
        return self.source_id

    def get_target_id(self) -> str:
        """
        Returns primary node identifier of relationship target.

        Returns:
            str: target_id
        """
        return self.target_id

    def get_label(self) -> str:
        """
        Returns relationship label.

        Returns:
            str: relationship_label
        """
        return self.relationship_label

    def get_type(self) -> str:
        """
        Returns relationship label (same value as ``get_label``).

        Returns:
            str: relationship_label
        """
        return self.relationship_label

    def get_properties(self) -> dict:
        """
        Returns all other relationship properties apart from primary ids
        and label as key-value pairs.

        Returns:
            dict: properties
        """
        return self.properties

    def get_dict(self) -> dict:
        """
        Return dict of ids, label, and properties.

        Returns:
            dict: relationship_id, source_id, target_id and
            relationship_label as top-level key-value pairs,
            properties as second-level dict. A falsy relationship_id
            (eg an empty string) is reported as None.
        """
        return {
            "relationship_id": self.relationship_id or None,
            "source_id": self.source_id,
            "target_id": self.target_id,
            "relationship_label": self.relationship_label,
            "properties": self.properties,
        }
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class BioCypherRelAsNode:
    """
    Class to represent relationships as nodes (with in- and outgoing
    edges) as a triplet of a BioCypherNode and two BioCypherEdges. Main
    usage in type checking (instances where the receiving function needs
    to check whether it receives a relationship as a single edge or as
    a triplet).

    Args:

        node (BioCypherNode): node representing the relationship

        source_edge (BioCypherEdge): edge representing the source of the
            relationship

        target_edge (BioCypherEdge): edge representing the target of the
            relationship

    Raises:

        TypeError: if any of the three members is not an instance of
            its expected class.

    """

    node: BioCypherNode
    source_edge: BioCypherEdge
    target_edge: BioCypherEdge

    def __post_init__(self):
        """Validate the types of the three members, in declaration order."""
        expected_types = (
            ("node", BioCypherNode),
            ("source_edge", BioCypherEdge),
            ("target_edge", BioCypherEdge),
        )

        for attribute, expected in expected_types:
            member = getattr(self, attribute)
            if isinstance(member, expected):
                continue
            raise TypeError(
                f"BioCypherRelAsNode.{attribute} must be a "
                f"{expected.__name__}, not {type(member)}.",
            )

    def get_node(self) -> BioCypherNode:
        """Return the node representing the relationship."""
        return self.node

    def get_source_edge(self) -> BioCypherEdge:
        """Return the edge representing the source of the relationship."""
        return self.source_edge

    def get_target_edge(self) -> BioCypherEdge:
        """Return the edge representing the target of the relationship."""
        return self.target_edge
|
Reference in New Issue
Block a user