# Source snapshot metadata (page-scrape residue, kept as a comment):
# 2025-04-16 22:12:19 +02:00 — 735 lines, 22 KiB, Python

#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher core module. Interfaces with the user and distributes tasks to
submodules.
"""
from typing import Optional
from datetime import datetime
import os
import json
from more_itertools import peekable
import yaml
import pandas as pd
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from ._get import Downloader
from ._config import config as _config
from ._config import update_from_file as _file_update
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
from ._mapping import OntologyMapping
from ._ontology import Ontology
from ._translate import Translator
from ._deduplicate import Deduplicator
from .output.in_memory._pandas import Pandas
from .output.write._get_writer import DBMS_TO_CLASS, get_writer
from .output.connect._neo4j_driver import get_driver
__all__ = ["BioCypher"]
# DBMSs for which an output writer exists (derived from the writer registry).
SUPPORTED_DBMS = DBMS_TO_CLASS.keys()
# Keys that must be present in the "biocypher" configuration section;
# __init__ raises ValueError if any is missing.
REQUIRED_CONFIG = [
"dbms",
"offline",
"strict_mode",
"head_ontology",
]
class BioCypher:
    """
    Orchestration of BioCypher operations. Instantiate this class to interact
    with BioCypher.

    All parameters are optional; values not given here are read from the
    "biocypher" section of the configuration (explicit arguments win).

    Args:
        dbms (str): The database management system to use. For supported
            systems see SUPPORTED_DBMS.
        offline (bool): Whether to run in offline mode. If True, no
            connection to the database will be made.
        strict_mode (bool): Whether to run in strict mode. If True, the
            translator will raise an error if a node or edge does not
            provide source, version, and licence information.
        biocypher_config_path (str): Path to the BioCypher config file.
        schema_config_path (str): Path to the user schema config
            file.
        head_ontology (dict): The head ontology defined by URL ('url') and root
            node ('root_node').
        tail_ontologies (dict): The tail ontologies defined by URL and
            join nodes for both head and tail ontology.
        output_directory (str): Path to the output directory. If not
            provided, the default value 'biocypher-out' will be used.
        cache_directory (str): Path to the download cache directory used by
            the Downloader.
        db_name (str): Deprecated; use the `database_name` setting in the
            `biocypher_config.yaml` file instead.
    """
    def __init__(
        self,
        dbms: Optional[str] = None,
        offline: Optional[bool] = None,
        strict_mode: Optional[bool] = None,
        biocypher_config_path: Optional[str] = None,
        schema_config_path: Optional[str] = None,
        head_ontology: Optional[dict] = None,
        tail_ontologies: Optional[dict] = None,
        output_directory: Optional[str] = None,
        cache_directory: Optional[str] = None,
        # legacy params
        db_name: Optional[str] = None,
    ):
        """
        Resolve configuration (explicit arguments override the config file),
        validate it, and initialise the lazily-created collaborators.
        """
        # Update configuration if custom path is provided
        if biocypher_config_path:
            _file_update(biocypher_config_path)
        if db_name:
            logger.warning(
                "The parameter `db_name` is deprecated. Please set the "
                "`database_name` setting in the `biocypher_config.yaml` file "
                "instead."
            )
            # NOTE(review): the config section is keyed by the *value* of
            # `db_name` here; confirm this matches the layout `_config`
            # expects (a section named after the DBMS seems more likely).
            _config(**{db_name: {"database_name": db_name}})
        # Load configuration
        self.base_config = _config("biocypher")
        # Check for required configuration
        for key in REQUIRED_CONFIG:
            if key not in self.base_config:
                raise ValueError(f"Configuration key {key} is required.")
        # Set configuration - mandatory
        self._dbms = dbms or self.base_config["dbms"]
        # `is None` (not truthiness) so an explicit False argument is honoured
        if offline is None:
            self._offline = self.base_config["offline"]
        else:
            self._offline = offline
        if strict_mode is None:
            self._strict_mode = self.base_config["strict_mode"]
        else:
            self._strict_mode = strict_mode
        self._schema_config_path = schema_config_path or self.base_config.get(
            "schema_config_path"
        )
        if not self._schema_config_path:
            logger.warning("Running BioCypher without schema configuration.")
        else:
            logger.info(
                f"Running BioCypher with schema configuration from {self._schema_config_path}."
            )
        self._head_ontology = head_ontology or self.base_config["head_ontology"]
        # Set configuration - optional
        self._output_directory = output_directory or self.base_config.get(
            "output_directory"
        )
        self._cache_directory = cache_directory or self.base_config.get(
            "cache_directory"
        )
        self._tail_ontologies = tail_ontologies or self.base_config.get(
            "tail_ontologies"
        )
        if self._dbms not in SUPPORTED_DBMS:
            raise ValueError(
                f"DBMS {self._dbms} not supported. "
                f"Please select from {SUPPORTED_DBMS}."
            )
        # Initialize lazily-created collaborators (see the _get_* methods)
        self._ontology_mapping = None
        self._deduplicator = None
        self._translator = None
        self._downloader = None
        self._ontology = None
        self._writer = None
        self._pd = None
def _get_deduplicator(self) -> Deduplicator:
"""
Create deduplicator if not exists and return.
"""
if not self._deduplicator:
self._deduplicator = Deduplicator()
return self._deduplicator
def _get_ontology_mapping(self) -> OntologyMapping:
"""
Create ontology mapping if not exists and return.
"""
if not self._schema_config_path:
self._ontology_mapping = OntologyMapping()
if not self._ontology_mapping:
self._ontology_mapping = OntologyMapping(
config_file=self._schema_config_path,
)
return self._ontology_mapping
def _get_ontology(self) -> Ontology:
"""
Create ontology if not exists and return.
"""
if not self._ontology:
self._ontology = Ontology(
ontology_mapping=self._get_ontology_mapping(),
head_ontology=self._head_ontology,
tail_ontologies=self._tail_ontologies,
)
return self._ontology
def _get_translator(self) -> Translator:
"""
Create translator if not exists and return.
"""
if not self._translator:
self._translator = Translator(
ontology=self._get_ontology(),
strict_mode=self._strict_mode,
)
return self._translator
def _get_writer(self):
"""
Create writer if not online. Set as instance variable `self._writer`.
"""
if self._offline:
timestamp = lambda: datetime.now().strftime("%Y%m%d%H%M%S")
outdir = self._output_directory or os.path.join(
"biocypher-out", timestamp()
)
self._output_directory = os.path.abspath(outdir)
self._writer = get_writer(
dbms=self._dbms,
translator=self._get_translator(),
deduplicator=self._get_deduplicator(),
output_directory=self._output_directory,
strict_mode=self._strict_mode,
)
else:
raise NotImplementedError("Cannot get writer in online mode.")
def _get_driver(self):
"""
Create driver if not exists. Set as instance variable `self._driver`.
"""
if not self._offline:
self._driver = get_driver(
dbms=self._dbms,
translator=self._get_translator(),
deduplicator=self._get_deduplicator(),
)
else:
raise NotImplementedError("Cannot get driver in offline mode.")
def write_nodes(
self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
"""
Write nodes to database. Either takes an iterable of tuples (if given,
translates to ``BioCypherNode`` objects) or an iterable of
``BioCypherNode`` objects.
Args:
nodes (iterable): An iterable of nodes to write to the database.
batch_size (int): The batch size to use when writing to disk.
force (bool): Whether to force writing to the output directory even
if the node type is not present in the schema config file.
Returns:
bool: True if successful.
"""
if not self._writer:
self._get_writer()
nodes = peekable(nodes)
if not isinstance(nodes.peek(), BioCypherNode):
tnodes = self._translator.translate_nodes(nodes)
else:
tnodes = nodes
# write node files
return self._writer.write_nodes(
tnodes, batch_size=batch_size, force=force
)
def write_edges(self, edges, batch_size: int = int(1e6)) -> bool:
"""
Write edges to database. Either takes an iterable of tuples (if given,
translates to ``BioCypherEdge`` objects) or an iterable of
``BioCypherEdge`` objects.
Args:
edges (iterable): An iterable of edges to write to the database.
Returns:
bool: True if successful.
"""
if not self._writer:
self._get_writer()
edges = peekable(edges)
if not isinstance(edges.peek(), BioCypherEdge):
tedges = self._translator.translate_edges(edges)
else:
tedges = edges
# write edge files
return self._writer.write_edges(tedges, batch_size=batch_size)
def to_df(self) -> list[pd.DataFrame]:
"""
Convert entities to a pandas DataFrame for each entity type and return
a list.
Args:
entities (iterable): An iterable of entities to convert to a
DataFrame.
Returns:
pd.DataFrame: A pandas DataFrame.
"""
if not self._pd:
raise ValueError(
"No pandas instance found. Please call `add()` first."
)
return self._pd.dfs
def add(self, entities) -> None:
"""
Function to add entities to the in-memory database. Accepts an iterable
of tuples (if given, translates to ``BioCypherNode`` or
``BioCypherEdge`` objects) or an iterable of ``BioCypherNode`` or
``BioCypherEdge`` objects.
Args:
entities (iterable): An iterable of entities to add to the database.
Can be 3-tuples (nodes) or 5-tuples (edges); also accepts
4-tuples for edges (deprecated).
Returns:
None
"""
if not self._pd:
self._pd = Pandas(
translator=self._get_translator(),
deduplicator=self._get_deduplicator(),
)
entities = peekable(entities)
if (
isinstance(entities.peek(), BioCypherNode)
or isinstance(entities.peek(), BioCypherEdge)
or isinstance(entities.peek(), BioCypherRelAsNode)
):
tentities = entities
elif len(entities.peek()) < 4:
tentities = self._translator.translate_nodes(entities)
else:
tentities = self._translator.translate_edges(entities)
self._pd.add_tables(tentities)
def add_nodes(self, nodes) -> None:
"""
Wrapper for ``add()`` to add nodes to the in-memory database.
Args:
nodes (iterable): An iterable of node tuples to add to the database.
Returns:
None
"""
self.add(nodes)
def add_edges(self, edges) -> None:
"""
Wrapper for ``add()`` to add edges to the in-memory database.
Args:
edges (iterable): An iterable of edge tuples to add to the database.
Returns:
None
"""
self.add(edges)
def merge_nodes(self, nodes) -> bool:
"""
Merge nodes into database. Either takes an iterable of tuples (if given,
translates to ``BioCypherNode`` objects) or an iterable of
``BioCypherNode`` objects.
Args:
nodes (iterable): An iterable of nodes to merge into the database.
Returns:
bool: True if successful.
"""
if not self._driver:
self._get_driver()
nodes = peekable(nodes)
if not isinstance(nodes.peek(), BioCypherNode):
tnodes = self._translator.translate_nodes(nodes)
else:
tnodes = nodes
# write node files
return self._driver.add_biocypher_nodes(tnodes)
def merge_edges(self, edges) -> bool:
"""
Merge edges into database. Either takes an iterable of tuples (if given,
translates to ``BioCypherEdge`` objects) or an iterable of
``BioCypherEdge`` objects.
Args:
edges (iterable): An iterable of edges to merge into the database.
Returns:
bool: True if successful.
"""
if not self._driver:
self._get_driver()
edges = peekable(edges)
if not isinstance(edges.peek(), BioCypherEdge):
tedges = self._translator.translate_edges(edges)
else:
tedges = edges
# write edge files
return self._driver.add_biocypher_edges(tedges)
# DOWNLOAD AND CACHE MANAGEMENT METHODS ###
def _get_downloader(self, cache_dir: Optional[str] = None):
"""
Create downloader if not exists.
"""
if not self._downloader:
self._downloader = Downloader(self._cache_directory)
def download(self, *resources) -> None:
"""
Use the :class:`Downloader` class to download or load from cache the
resources given by the adapter.
"""
self._get_downloader()
return self._downloader.download(*resources)
# OVERVIEW AND CONVENIENCE METHODS ###
def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]:
"""
Get the set of input labels encountered without an entry in the
`schema_config.yaml` and print them to the logger.
Returns:
Optional[Dict[str, List[str]]]: A dictionary of Biolink types
encountered without an entry in the `schema_config.yaml` file.
"""
mt = self._translator.get_missing_biolink_types()
if mt:
msg = (
"Input entities not accounted for due to them not being "
f"present in the schema configuration file {self._schema_config_path} "
"(this is not necessarily a problem, if you did not intend "
"to include them in the database; see the log for details): \n"
)
for k, v in mt.items():
msg += f" {k}: {v} \n"
logger.info(msg)
return mt
else:
logger.info("No missing labels in input.")
return None
def log_duplicates(self) -> None:
"""
Get the set of duplicate nodes and edges encountered and print them to
the logger.
"""
dn = self._deduplicator.get_duplicate_nodes()
if dn:
ntypes = dn[0]
nids = dn[1]
msg = "Duplicate node types encountered (IDs in log): \n"
for typ in ntypes:
msg += f" {typ}\n"
logger.info(msg)
idmsg = "Duplicate node IDs encountered: \n"
for _id in nids:
idmsg += f" {_id}\n"
logger.debug(idmsg)
else:
logger.info("No duplicate nodes in input.")
de = self._deduplicator.get_duplicate_edges()
if de:
etypes = de[0]
eids = de[1]
msg = "Duplicate edge types encountered (IDs in log): \n"
for typ in etypes:
msg += f" {typ}\n"
logger.info(msg)
idmsg = "Duplicate edge IDs encountered: \n"
for _id in eids:
idmsg += f" {_id}\n"
logger.debug(idmsg)
else:
logger.info("No duplicate edges in input.")
def show_ontology_structure(self, **kwargs) -> None:
"""
Show the ontology structure using treelib or write to GRAPHML file.
Args:
to_disk (str): If specified, the ontology structure will be saved
to disk as a GRAPHML file, to be opened in your favourite
graph visualisation tool.
full (bool): If True, the full ontology structure will be shown,
including all nodes and edges. If False, only the nodes and
edges that are relevant to the extended schema will be shown.
"""
if not self._ontology:
self._get_ontology()
return self._ontology.show_ontology_structure(**kwargs)
def write_import_call(self) -> str:
"""
Write a shell script to import the database depending on the chosen
DBMS.
Returns:
str: path toward the file holding the import call.
"""
if not self._offline:
raise NotImplementedError(
"Cannot write import call in online mode."
)
return self._writer.write_import_call()
    def write_schema_info(self, as_node: bool = False) -> dict:
        """
        Write an extended schema info YAML file that extends the
        `schema_config.yaml` with run-time information of the built KG. For
        instance, include information on whether something present in the actual
        knowledge graph, whether it is a relationship (which is important in the
        case of representing relationships as nodes) and the actual sources and
        targets of edges. Since this file can be used in place of the original
        `schema_config.yaml` file, it indicates that it is the extended schema
        by setting `is_schema_info` to `true`.

        We start by using the `extended_schema` dictionary from the ontology
        class instance, which contains all expanded entities and relationships.
        The information of whether something is a relationship can be gathered
        from the deduplicator instance, which keeps track of all entities that
        have been seen.

        Args:
            as_node (bool): If True, additionally write the schema info as a
                `schema_info` node and regenerate the import call.

        Returns:
            dict: The extended schema info dictionary that was written.
        """
        if not self._offline:
            raise NotImplementedError(
                "Cannot write schema info in online mode."
            )
        ontology = self._get_ontology()
        # Copy so the ontology's own mapping is not mutated below.
        schema = ontology.mapping.extended_schema.copy()
        schema["is_schema_info"] = True
        deduplicator = self._get_deduplicator()
        # Mark every seen entity type as present; unseen-in-schema types are
        # only logged, not added.
        for node in deduplicator.entity_types:
            if node in schema.keys():
                schema[node]["present_in_knowledge_graph"] = True
                schema[node]["is_relationship"] = False
            else:
                logger.info(
                    f"Node {node} not present in extended schema. "
                    "Skipping schema info."
                )
        # find 'label_as_edge' cases in schema entries
        # (maps the seen edge label back to its schema key)
        changed_labels = {}
        for k, v in schema.items():
            if not isinstance(v, dict):
                continue
            if "label_as_edge" in v.keys():
                if v["label_as_edge"] in deduplicator.seen_relationships.keys():
                    changed_labels[v["label_as_edge"]] = k
        for edge in deduplicator.seen_relationships.keys():
            # Translate a label_as_edge label back to its schema key.
            if edge in changed_labels.keys():
                edge = changed_labels[edge]
            if edge in schema.keys():
                schema[edge]["present_in_knowledge_graph"] = True
                schema[edge]["is_relationship"] = True
                # TODO information about source and target nodes
            else:
                logger.info(
                    f"Edge {edge} not present in extended schema. "
                    "Skipping schema info."
                )
        # write to output directory as YAML file
        path = os.path.join(self._output_directory, "schema_info.yaml")
        with open(path, "w") as f:
            f.write(yaml.dump(schema))
        if as_node:
            # write as node
            node = BioCypherNode(
                node_id="schema_info",
                node_label="schema_info",
                properties={"schema_info": json.dumps(schema)},
            )
            self.write_nodes([node], force=True)
            # override import call with added schema info node
            self.write_import_call()
        return schema
# TRANSLATION METHODS ###
def translate_term(self, term: str) -> str:
"""
Translate a term to its BioCypher equivalent.
Args:
term (str): The term to translate.
Returns:
str: The BioCypher equivalent of the term.
"""
# instantiate adapter if not exists
self.start_ontology()
return self._translator.translate_term(term)
def summary(self) -> None:
"""
Wrapper for showing ontology structure and logging duplicates and
missing input types.
"""
self.show_ontology_structure()
self.log_duplicates()
self.log_missing_input_labels()
def reverse_translate_term(self, term: str) -> str:
"""
Reverse translate a term from its BioCypher equivalent.
Args:
term (str): The BioCypher term to reverse translate.
Returns:
str: The original term.
"""
# instantiate adapter if not exists
self.start_ontology()
return self._translator.reverse_translate_term(term)
def translate_query(self, query: str) -> str:
"""
Translate a query to its BioCypher equivalent.
Args:
query (str): The query to translate.
Returns:
str: The BioCypher equivalent of the query.
"""
# instantiate adapter if not exists
self.start_ontology()
return self._translator.translate(query)
def reverse_translate_query(self, query: str) -> str:
"""
Reverse translate a query from its BioCypher equivalent.
Args:
query (str): The BioCypher query to reverse translate.
Returns:
str: The original query.
"""
# instantiate adapter if not exists
self.start_ontology()
return self._translator.reverse_translate(query)