#!/usr/bin/env python # # Copyright 2021, Heidelberg University Clinic # # File author(s): Sebastian Lobentanzer # ... # # Distributed under MIT licence, see the file `LICENSE`. # """ BioCypher core module. Interfaces with the user and distributes tasks to submodules. """ from typing import Optional from datetime import datetime import os import json from more_itertools import peekable import yaml import pandas as pd from ._logger import logger logger.debug(f"Loading module {__name__}.") from ._get import Downloader from ._config import config as _config from ._config import update_from_file as _file_update from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode from ._mapping import OntologyMapping from ._ontology import Ontology from ._translate import Translator from ._deduplicate import Deduplicator from .output.in_memory._pandas import Pandas from .output.write._get_writer import DBMS_TO_CLASS, get_writer from .output.connect._neo4j_driver import get_driver __all__ = ["BioCypher"] SUPPORTED_DBMS = DBMS_TO_CLASS.keys() REQUIRED_CONFIG = [ "dbms", "offline", "strict_mode", "head_ontology", ] class BioCypher: """ Orchestration of BioCypher operations. Instantiate this class to interact with BioCypher. Args: dbms (str): The database management system to use. For supported systems see SUPPORTED_DBMS. offline (bool): Whether to run in offline mode. If True, no connection to the database will be made. strict_mode (bool): Whether to run in strict mode. If True, the translator will raise an error if a node or edge does not provide source, version, and licence information. biocypher_config_path (str): Path to the BioCypher config file. schema_config_path (str): Path to the user schema config file. head_ontology (dict): The head ontology defined by URL ('url') and root node ('root_node'). tail_ontologies (dict): The tail ontologies defined by URL and join nodes for both head and tail ontology. output_directory (str): Path to the output directory. If not provided, the default value 'biocypher-out' will be used. """ def __init__( self, dbms: str = None, offline: bool = None, strict_mode: bool = None, biocypher_config_path: str = None, schema_config_path: str = None, head_ontology: dict = None, tail_ontologies: dict = None, output_directory: str = None, cache_directory: str = None, # legacy params db_name: str = None, ): # Update configuration if custom path is provided if biocypher_config_path: _file_update(biocypher_config_path) if db_name: logger.warning( "The parameter `db_name` is deprecated. Please set the " "`database_name` setting in the `biocypher_config.yaml` file " "instead." ) _config(**{db_name: {"database_name": db_name}}) # Load configuration self.base_config = _config("biocypher") # Check for required configuration for key in REQUIRED_CONFIG: if key not in self.base_config: raise ValueError(f"Configuration key {key} is required.") # Set configuration - mandatory self._dbms = dbms or self.base_config["dbms"] if offline is None: self._offline = self.base_config["offline"] else: self._offline = offline if strict_mode is None: self._strict_mode = self.base_config["strict_mode"] else: self._strict_mode = strict_mode self._schema_config_path = schema_config_path or self.base_config.get( "schema_config_path" ) if not self._schema_config_path: logger.warning("Running BioCypher without schema configuration.") else: logger.info( f"Running BioCypher with schema configuration from {self._schema_config_path}." ) self._head_ontology = head_ontology or self.base_config["head_ontology"] # Set configuration - optional self._output_directory = output_directory or self.base_config.get( "output_directory" ) self._cache_directory = cache_directory or self.base_config.get( "cache_directory" ) self._tail_ontologies = tail_ontologies or self.base_config.get( "tail_ontologies" ) if self._dbms not in SUPPORTED_DBMS: raise ValueError( f"DBMS {self._dbms} not supported. " f"Please select from {SUPPORTED_DBMS}." ) # Initialize self._ontology_mapping = None self._deduplicator = None self._translator = None self._downloader = None self._ontology = None self._writer = None self._pd = None def _get_deduplicator(self) -> Deduplicator: """ Create deduplicator if not exists and return. """ if not self._deduplicator: self._deduplicator = Deduplicator() return self._deduplicator def _get_ontology_mapping(self) -> OntologyMapping: """ Create ontology mapping if not exists and return. """ if not self._schema_config_path: self._ontology_mapping = OntologyMapping() if not self._ontology_mapping: self._ontology_mapping = OntologyMapping( config_file=self._schema_config_path, ) return self._ontology_mapping def _get_ontology(self) -> Ontology: """ Create ontology if not exists and return. """ if not self._ontology: self._ontology = Ontology( ontology_mapping=self._get_ontology_mapping(), head_ontology=self._head_ontology, tail_ontologies=self._tail_ontologies, ) return self._ontology def _get_translator(self) -> Translator: """ Create translator if not exists and return. """ if not self._translator: self._translator = Translator( ontology=self._get_ontology(), strict_mode=self._strict_mode, ) return self._translator def _get_writer(self): """ Create writer if not online. Set as instance variable `self._writer`. """ if self._offline: timestamp = lambda: datetime.now().strftime("%Y%m%d%H%M%S") outdir = self._output_directory or os.path.join( "biocypher-out", timestamp() ) self._output_directory = os.path.abspath(outdir) self._writer = get_writer( dbms=self._dbms, translator=self._get_translator(), deduplicator=self._get_deduplicator(), output_directory=self._output_directory, strict_mode=self._strict_mode, ) else: raise NotImplementedError("Cannot get writer in online mode.") def _get_driver(self): """ Create driver if not exists. Set as instance variable `self._driver`. """ if not self._offline: self._driver = get_driver( dbms=self._dbms, translator=self._get_translator(), deduplicator=self._get_deduplicator(), ) else: raise NotImplementedError("Cannot get driver in offline mode.") def write_nodes( self, nodes, batch_size: int = int(1e6), force: bool = False ) -> bool: """ Write nodes to database. Either takes an iterable of tuples (if given, translates to ``BioCypherNode`` objects) or an iterable of ``BioCypherNode`` objects. Args: nodes (iterable): An iterable of nodes to write to the database. batch_size (int): The batch size to use when writing to disk. force (bool): Whether to force writing to the output directory even if the node type is not present in the schema config file. Returns: bool: True if successful. """ if not self._writer: self._get_writer() nodes = peekable(nodes) if not isinstance(nodes.peek(), BioCypherNode): tnodes = self._translator.translate_nodes(nodes) else: tnodes = nodes # write node files return self._writer.write_nodes( tnodes, batch_size=batch_size, force=force ) def write_edges(self, edges, batch_size: int = int(1e6)) -> bool: """ Write edges to database. Either takes an iterable of tuples (if given, translates to ``BioCypherEdge`` objects) or an iterable of ``BioCypherEdge`` objects. Args: edges (iterable): An iterable of edges to write to the database. Returns: bool: True if successful. """ if not self._writer: self._get_writer() edges = peekable(edges) if not isinstance(edges.peek(), BioCypherEdge): tedges = self._translator.translate_edges(edges) else: tedges = edges # write edge files return self._writer.write_edges(tedges, batch_size=batch_size) def to_df(self) -> list[pd.DataFrame]: """ Convert entities to a pandas DataFrame for each entity type and return a list. Args: entities (iterable): An iterable of entities to convert to a DataFrame. Returns: pd.DataFrame: A pandas DataFrame. """ if not self._pd: raise ValueError( "No pandas instance found. Please call `add()` first." ) return self._pd.dfs def add(self, entities) -> None: """ Function to add entities to the in-memory database. Accepts an iterable of tuples (if given, translates to ``BioCypherNode`` or ``BioCypherEdge`` objects) or an iterable of ``BioCypherNode`` or ``BioCypherEdge`` objects. Args: entities (iterable): An iterable of entities to add to the database. Can be 3-tuples (nodes) or 5-tuples (edges); also accepts 4-tuples for edges (deprecated). Returns: None """ if not self._pd: self._pd = Pandas( translator=self._get_translator(), deduplicator=self._get_deduplicator(), ) entities = peekable(entities) if ( isinstance(entities.peek(), BioCypherNode) or isinstance(entities.peek(), BioCypherEdge) or isinstance(entities.peek(), BioCypherRelAsNode) ): tentities = entities elif len(entities.peek()) < 4: tentities = self._translator.translate_nodes(entities) else: tentities = self._translator.translate_edges(entities) self._pd.add_tables(tentities) def add_nodes(self, nodes) -> None: """ Wrapper for ``add()`` to add nodes to the in-memory database. Args: nodes (iterable): An iterable of node tuples to add to the database. Returns: None """ self.add(nodes) def add_edges(self, edges) -> None: """ Wrapper for ``add()`` to add edges to the in-memory database. Args: edges (iterable): An iterable of edge tuples to add to the database. Returns: None """ self.add(edges) def merge_nodes(self, nodes) -> bool: """ Merge nodes into database. Either takes an iterable of tuples (if given, translates to ``BioCypherNode`` objects) or an iterable of ``BioCypherNode`` objects. Args: nodes (iterable): An iterable of nodes to merge into the database. Returns: bool: True if successful. """ if not self._driver: self._get_driver() nodes = peekable(nodes) if not isinstance(nodes.peek(), BioCypherNode): tnodes = self._translator.translate_nodes(nodes) else: tnodes = nodes # write node files return self._driver.add_biocypher_nodes(tnodes) def merge_edges(self, edges) -> bool: """ Merge edges into database. Either takes an iterable of tuples (if given, translates to ``BioCypherEdge`` objects) or an iterable of ``BioCypherEdge`` objects. Args: edges (iterable): An iterable of edges to merge into the database. Returns: bool: True if successful. """ if not self._driver: self._get_driver() edges = peekable(edges) if not isinstance(edges.peek(), BioCypherEdge): tedges = self._translator.translate_edges(edges) else: tedges = edges # write edge files return self._driver.add_biocypher_edges(tedges) # DOWNLOAD AND CACHE MANAGEMENT METHODS ### def _get_downloader(self, cache_dir: Optional[str] = None): """ Create downloader if not exists. """ if not self._downloader: self._downloader = Downloader(self._cache_directory) def download(self, *resources) -> None: """ Use the :class:`Downloader` class to download or load from cache the resources given by the adapter. """ self._get_downloader() return self._downloader.download(*resources) # OVERVIEW AND CONVENIENCE METHODS ### def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]: """ Get the set of input labels encountered without an entry in the `schema_config.yaml` and print them to the logger. Returns: Optional[Dict[str, List[str]]]: A dictionary of Biolink types encountered without an entry in the `schema_config.yaml` file. """ mt = self._translator.get_missing_biolink_types() if mt: msg = ( "Input entities not accounted for due to them not being " f"present in the schema configuration file {self._schema_config_path} " "(this is not necessarily a problem, if you did not intend " "to include them in the database; see the log for details): \n" ) for k, v in mt.items(): msg += f" {k}: {v} \n" logger.info(msg) return mt else: logger.info("No missing labels in input.") return None def log_duplicates(self) -> None: """ Get the set of duplicate nodes and edges encountered and print them to the logger. """ dn = self._deduplicator.get_duplicate_nodes() if dn: ntypes = dn[0] nids = dn[1] msg = "Duplicate node types encountered (IDs in log): \n" for typ in ntypes: msg += f" {typ}\n" logger.info(msg) idmsg = "Duplicate node IDs encountered: \n" for _id in nids: idmsg += f" {_id}\n" logger.debug(idmsg) else: logger.info("No duplicate nodes in input.") de = self._deduplicator.get_duplicate_edges() if de: etypes = de[0] eids = de[1] msg = "Duplicate edge types encountered (IDs in log): \n" for typ in etypes: msg += f" {typ}\n" logger.info(msg) idmsg = "Duplicate edge IDs encountered: \n" for _id in eids: idmsg += f" {_id}\n" logger.debug(idmsg) else: logger.info("No duplicate edges in input.") def show_ontology_structure(self, **kwargs) -> None: """ Show the ontology structure using treelib or write to GRAPHML file. Args: to_disk (str): If specified, the ontology structure will be saved to disk as a GRAPHML file, to be opened in your favourite graph visualisation tool. full (bool): If True, the full ontology structure will be shown, including all nodes and edges. If False, only the nodes and edges that are relevant to the extended schema will be shown. """ if not self._ontology: self._get_ontology() return self._ontology.show_ontology_structure(**kwargs) def write_import_call(self) -> str: """ Write a shell script to import the database depending on the chosen DBMS. Returns: str: path toward the file holding the import call. """ if not self._offline: raise NotImplementedError( "Cannot write import call in online mode." ) return self._writer.write_import_call() def write_schema_info(self, as_node: bool = False) -> None: """ Write an extended schema info YAML file that extends the `schema_config.yaml` with run-time information of the built KG. For instance, include information on whether something present in the actual knowledge graph, whether it is a relationship (which is important in the case of representing relationships as nodes) and the actual sources and targets of edges. Since this file can be used in place of the original `schema_config.yaml` file, it indicates that it is the extended schema by setting `is_schema_info` to `true`. We start by using the `extended_schema` dictionary from the ontology class instance, which contains all expanded entities and relationships. The information of whether something is a relationship can be gathered from the deduplicator instance, which keeps track of all entities that have been seen. """ if not self._offline: raise NotImplementedError( "Cannot write schema info in online mode." ) ontology = self._get_ontology() schema = ontology.mapping.extended_schema.copy() schema["is_schema_info"] = True deduplicator = self._get_deduplicator() for node in deduplicator.entity_types: if node in schema.keys(): schema[node]["present_in_knowledge_graph"] = True schema[node]["is_relationship"] = False else: logger.info( f"Node {node} not present in extended schema. " "Skipping schema info." ) # find 'label_as_edge' cases in schema entries changed_labels = {} for k, v in schema.items(): if not isinstance(v, dict): continue if "label_as_edge" in v.keys(): if v["label_as_edge"] in deduplicator.seen_relationships.keys(): changed_labels[v["label_as_edge"]] = k for edge in deduplicator.seen_relationships.keys(): if edge in changed_labels.keys(): edge = changed_labels[edge] if edge in schema.keys(): schema[edge]["present_in_knowledge_graph"] = True schema[edge]["is_relationship"] = True # TODO information about source and target nodes else: logger.info( f"Edge {edge} not present in extended schema. " "Skipping schema info." ) # write to output directory as YAML file path = os.path.join(self._output_directory, "schema_info.yaml") with open(path, "w") as f: f.write(yaml.dump(schema)) if as_node: # write as node node = BioCypherNode( node_id="schema_info", node_label="schema_info", properties={"schema_info": json.dumps(schema)}, ) self.write_nodes([node], force=True) # override import call with added schema info node self.write_import_call() return schema # TRANSLATION METHODS ### def translate_term(self, term: str) -> str: """ Translate a term to its BioCypher equivalent. Args: term (str): The term to translate. Returns: str: The BioCypher equivalent of the term. """ # instantiate adapter if not exists self.start_ontology() return self._translator.translate_term(term) def summary(self) -> None: """ Wrapper for showing ontology structure and logging duplicates and missing input types. """ self.show_ontology_structure() self.log_duplicates() self.log_missing_input_labels() def reverse_translate_term(self, term: str) -> str: """ Reverse translate a term from its BioCypher equivalent. Args: term (str): The BioCypher term to reverse translate. Returns: str: The original term. """ # instantiate adapter if not exists self.start_ontology() return self._translator.reverse_translate_term(term) def translate_query(self, query: str) -> str: """ Translate a query to its BioCypher equivalent. Args: query (str): The query to translate. Returns: str: The BioCypher equivalent of the query. """ # instantiate adapter if not exists self.start_ontology() return self._translator.translate(query) def reverse_translate_query(self, query: str) -> str: """ Reverse translate a query from its BioCypher equivalent. Args: query (str): The BioCypher query to reverse translate. Returns: str: The original query. """ # instantiate adapter if not exists self.start_ontology() return self._translator.reverse_translate(query)