2025-04-16 22:12:19 +02:00

423 lines
13 KiB
Python

#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'online' mode. Handles connection and manipulation of a running DBMS.
"""
import subprocess
from biocypher._logger import logger
logger.debug(f"Loading module {__name__}.")
from collections.abc import Iterable
import itertools
import neo4j_utils
from biocypher import _misc
from biocypher._config import config as _config
from biocypher._create import BioCypherEdge, BioCypherNode
from biocypher._translate import Translator
__all__ = ["_Neo4jDriver"]
class _Neo4jDriver:
"""
Manages a BioCypher connection to a Neo4j database using the
``neo4j_utils.Driver`` class.
Args:
database_name (str): The name of the database to connect to.
wipe (bool): Whether to wipe the database before importing.
uri (str): The URI of the database.
user (str): The username to use for authentication.
password (str): The password to use for authentication.
multi_db (bool): Whether to use multi-database mode.
fetch_size (int): The number of records to fetch at a time.
increment_version (bool): Whether to increment the version number.
translator (Translator): The translator to use for mapping.
"""
def __init__(
self,
database_name: str,
uri: str,
user: str,
password: str,
multi_db: bool,
translator: Translator,
wipe: bool = False,
fetch_size: int = 1000,
increment_version: bool = True,
):
self.translator = translator
self._driver = neo4j_utils.Driver(
db_name=database_name,
db_uri=uri,
db_user=user,
db_passwd=password,
fetch_size=fetch_size,
wipe=wipe,
multi_db=multi_db,
raise_errors=True,
)
# check for biocypher config in connected graph
if wipe:
self.init_db()
if increment_version:
# set new current version node
self._update_meta_graph()
def _update_meta_graph(self):
logger.info("Updating Neo4j meta graph.")
# find current version node
db_version = self._driver.query(
"MATCH (v:BioCypher) " "WHERE NOT (v)-[:PRECEDES]->() " "RETURN v",
)
# add version node
self.add_biocypher_nodes(self.translator.ontology)
# connect version node to previous
if db_version[0]:
previous = db_version[0][0]
previous_id = previous["v"]["id"]
e_meta = BioCypherEdge(
previous_id,
self.translator.ontology.get_dict().get("node_id"),
"PRECEDES",
)
self.add_biocypher_edges(e_meta)
def init_db(self):
"""
Used to initialise a property graph database by setting up new
constraints. Wipe has been performed by the ``neo4j_utils.Driver``
class` already.
Todo:
- set up constraint creation interactively depending on the
need of the database
"""
logger.info("Initialising database.")
self._create_constraints()
def _create_constraints(self):
"""
Creates constraints on node types in the graph. Used for
initial setup.
Grabs leaves of the ``schema_config.yaml`` file and creates
constraints on the id of all entities represented as nodes.
"""
logger.info("Creating constraints for node types in config.")
major_neo4j_version = int(self._get_neo4j_version().split(".")[0])
# get structure
for leaf in self.translator.ontology.mapping.extended_schema.items():
label = _misc.sentencecase_to_pascalcase(leaf[0], sep=r"\s\.")
if leaf[1]["represented_as"] == "node":
if major_neo4j_version >= 5:
s = (
f"CREATE CONSTRAINT `{label}_id` "
f"IF NOT EXISTS FOR (n:`{label}`) "
"REQUIRE n.id IS UNIQUE"
)
self._driver.query(s)
else:
s = (
f"CREATE CONSTRAINT `{label}_id` "
f"IF NOT EXISTS ON (n:`{label}`) "
"ASSERT n.id IS UNIQUE"
)
self._driver.query(s)
def _get_neo4j_version(self):
"""Get neo4j version."""
try:
neo4j_version = self._driver.query(
"""
CALL dbms.components()
YIELD name, versions, edition
UNWIND versions AS version
RETURN version AS version
""",
)[0][0]["version"]
return neo4j_version
except Exception as e:
logger.warning(
f"Error detecting Neo4j version: {e} use default version 4.0.0."
)
return "4.0.0"
def add_nodes(self, id_type_tuples: Iterable[tuple]) -> tuple:
"""
Generic node adder method to add any kind of input to the graph via the
:class:`biocypher.create.BioCypherNode` class. Employs translation
functionality and calls the :meth:`add_biocypher_nodes()` method.
Args:
id_type_tuples (iterable of 3-tuple): for each node to add to
the biocypher graph, a 3-tuple with the following layout:
first, the (unique if constrained) ID of the node; second, the
type of the node, capitalised or PascalCase and in noun form
(Neo4j primary label, eg `:Protein`); and third, a dictionary
of arbitrary properties the node should possess (can be empty).
Returns:
2-tuple: the query result of :meth:`add_biocypher_nodes()`
- first entry: data
- second entry: Neo4j summary.
"""
bn = self.translator.translate_nodes(id_type_tuples)
return self.add_biocypher_nodes(bn)
def add_edges(self, id_src_tar_type_tuples: Iterable[tuple]) -> tuple:
"""
Generic edge adder method to add any kind of input to the graph
via the :class:`biocypher.create.BioCypherEdge` class. Employs
translation functionality and calls the
:meth:`add_biocypher_edges()` method.
Args:
id_src_tar_type_tuples (iterable of 5-tuple):
for each edge to add to the biocypher graph, a 5-tuple
with the following layout: first, the optional unique ID
of the interaction. This can be `None` if there is no
systematic identifier (which for many interactions is
the case). Second and third, the (unique if constrained)
IDs of the source and target nodes of the relationship;
fourth, the type of the relationship; and fifth, a
dictionary of arbitrary properties the edge should
possess (can be empty).
Returns:
2-tuple: the query result of :meth:`add_biocypher_edges()`
- first entry: data
- second entry: Neo4j summary.
"""
bn = self.translator.translate_edges(id_src_tar_type_tuples)
return self.add_biocypher_edges(bn)
def add_biocypher_nodes(
self,
nodes: Iterable[BioCypherNode],
explain: bool = False,
profile: bool = False,
) -> bool:
"""
Accepts a node type handoff class
(:class:`biocypher.create.BioCypherNode`) with id,
label, and a dict of properties (passing on the type of
property, ie, ``int``, ``str``, ...).
The dict retrieved by the
:meth:`biocypher.create.BioCypherNode.get_dict()` method is
passed into Neo4j as a map of maps, explicitly encoding node id
and label, and adding all other properties from the 'properties'
key of the dict. The merge is performed via APOC, matching only
on node id to prevent duplicates. The same properties are set on
match and on create, irrespective of the actual event.
Args:
nodes:
An iterable of :class:`biocypher.create.BioCypherNode` objects.
explain:
Call ``EXPLAIN`` on the CYPHER query.
profile:
Do profiling on the CYPHER query.
Returns:
True for success, False otherwise.
"""
try:
nodes = _misc.to_list(nodes)
entities = [node.get_dict() for node in nodes]
except AttributeError:
msg = "Nodes must have a `get_dict` method."
logger.error(msg)
raise ValueError(msg)
logger.info(f"Merging {len(entities)} nodes.")
entity_query = (
"UNWIND $entities AS ent "
"CALL apoc.merge.node([ent.node_label], "
"{id: ent.node_id}, ent.properties, ent.properties) "
"YIELD node "
"RETURN node"
)
method = "explain" if explain else "profile" if profile else "query"
result = getattr(self._driver, method)(
entity_query,
parameters={
"entities": entities,
},
)
logger.info("Finished merging nodes.")
return result
def add_biocypher_edges(
self,
edges: Iterable[BioCypherEdge],
explain: bool = False,
profile: bool = False,
) -> bool:
"""
Accepts an edge type handoff class
(:class:`biocypher.create.BioCypherEdge`) with source
and target ids, label, and a dict of properties (passing on the
type of property, ie, int, string ...).
The individual edge is either passed as a singleton, in the case
of representation as an edge in the graph, or as a 4-tuple, in
the case of representation as a node (with two edges connecting
to interaction partners).
The dict retrieved by the
:meth:`biocypher.create.BioCypherEdge.get_dict()` method is
passed into Neo4j as a map of maps, explicitly encoding source
and target ids and the relationship label, and adding all edge
properties from the 'properties' key of the dict. The merge is
performed via APOC, matching only on source and target id to
prevent duplicates. The same properties are set on match and on
create, irrespective of the actual event.
Args:
edges:
An iterable of :class:`biocypher.create.BioCypherEdge` objects.
explain:
Call ``EXPLAIN`` on the CYPHER query.
profile:
Do profiling on the CYPHER query.
Returns:
`True` for success, `False` otherwise.
"""
edges = _misc.ensure_iterable(edges)
edges = itertools.chain(*(_misc.ensure_iterable(i) for i in edges))
nodes = []
rels = []
try:
for e in edges:
if hasattr(e, "get_node"):
nodes.append(e.get_node())
rels.append(e.get_source_edge().get_dict())
rels.append(e.get_target_edge().get_dict())
else:
rels.append(e.get_dict())
except AttributeError:
msg = "Edges and nodes must have a `get_dict` method."
logger.error(msg)
raise ValueError(msg)
self.add_biocypher_nodes(nodes)
logger.info(f"Merging {len(rels)} edges.")
# cypher query
# merging only on the ids of the entities, passing the
# properties on match and on create;
# TODO add node labels?
node_query = (
"UNWIND $rels AS r "
"MERGE (src {id: r.source_id}) "
"MERGE (tar {id: r.target_id}) "
)
self._driver.query(node_query, parameters={"rels": rels})
edge_query = (
"UNWIND $rels AS r "
"MATCH (src {id: r.source_id}) "
"MATCH (tar {id: r.target_id}) "
"WITH src, tar, r "
"CALL apoc.merge.relationship"
"(src, r.relationship_label, NULL, "
"r.properties, tar, r.properties) "
"YIELD rel "
"RETURN rel"
)
method = "explain" if explain else "profile" if profile else "query"
result = getattr(self._driver, method)(
edge_query, parameters={"rels": rels}
)
logger.info("Finished merging edges.")
return result
def get_driver(
dbms: str,
translator: "Translator",
):
"""
Function to return the writer class.
Returns:
class: the writer class
"""
dbms_config = _config(dbms)
if dbms == "neo4j":
return _Neo4jDriver(
database_name=dbms_config["database_name"],
wipe=dbms_config["wipe"],
uri=dbms_config["uri"],
user=dbms_config["user"],
password=dbms_config["password"],
multi_db=dbms_config["multi_db"],
translator=translator,
)
return None