201 lines
7.3 KiB
Python
201 lines
7.3 KiB
Python
from abc import ABC, abstractmethod
|
|
from typing import Union, Optional
|
|
from collections.abc import Iterable
|
|
import os
|
|
|
|
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
from biocypher._logger import logger
|
|
from biocypher._translate import Translator
|
|
from biocypher._deduplicate import Deduplicator
|
|
|
|
# Names exported by a star-import of this module: only the abstract writer
# base class defined below.
__all__ = ["_Writer"]
|
|
|
|
|
|
class _Writer(ABC):
    """Abstract base class for writing node and edge representations to disk.

    Specifics of the different writers (e.g. neo4j, postgresql, csv, etc.)
    are implemented in the child classes. Any concrete writer needs to
    implement at least:

        - _write_node_data
        - _write_edge_data
        - _construct_import_call
        - _get_import_script_name

    Args:
        translator (Translator): Instance of :py:class:`Translator` to enable
            translation of nodes and manipulation of properties.
        deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to
            enable deduplication of nodes and edges.
        output_directory (str, optional): Path for exporting CSV files.
            Defaults to None, in which case no directory is checked or
            created.
        strict_mode (bool, optional): Whether to enforce source, version, and
            license properties. Defaults to False.

    Raises:
        NotImplementedError: Writer implementation must override
            '_write_node_data'
        NotImplementedError: Writer implementation must override
            '_write_edge_data'
        NotImplementedError: Writer implementation must override
            '_construct_import_call'
        NotImplementedError: Writer implementation must override
            '_get_import_script_name'
    """

    def __init__(
        self,
        translator: Translator,
        deduplicator: Deduplicator,
        output_directory: Optional[str] = None,
        strict_mode: bool = False,
        *args,
        **kwargs,
    ) -> None:
        """Store the collaborators and prepare the output directory.

        Args:
            translator (Translator): Instance of :py:class:`Translator` to
                enable translation of nodes and manipulation of properties.
            deduplicator (Deduplicator): Instance of :py:class:`Deduplicator`
                to enable deduplication of nodes and edges.
            output_directory (str, optional): Path for exporting CSV files.
                Defaults to None, in which case the directory check and
                creation are skipped.
            strict_mode (bool, optional): Whether to enforce source, version,
                and license properties. Defaults to False.
            *args: Accepted for compatibility with subclasses; not used here.
            **kwargs: Only ``write_to_file`` (default True) is read here. When
                it is falsy, the "already exists" warning is suppressed.
        """
        self.translator = translator
        self.deduplicator = deduplicator
        self.strict_mode = strict_mode
        self.output_directory = output_directory

        # ``None`` is the declared default, but ``os.path.exists(None)`` raises
        # ``TypeError``; without a configured directory there is nothing to
        # check or create.
        if self.output_directory is None:
            return

        if os.path.exists(self.output_directory):
            if kwargs.get("write_to_file", True):
                logger.warning(
                    f"Output directory `{self.output_directory}` already exists. "
                    "If this is not planned, file consistency may be compromised."
                )
        else:
            logger.info(f"Creating output directory `{self.output_directory}`.")
            # ``exist_ok`` keeps this from failing if the directory appears
            # between the existence check above and this call.
            os.makedirs(self.output_directory, exist_ok=True)

    @abstractmethod
    def _write_node_data(
        self,
        nodes: Iterable[
            Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
        ],
    ) -> bool:
        """Implement how to write nodes to disk.

        Args:
            nodes (Iterable): An iterable of BioCypherNode / BioCypherEdge /
                BioCypherRelAsNode objects.

        Returns:
            bool: The return value. True for success, False otherwise.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError(
            "Writer implementation must override '_write_node_data'"
        )

    @abstractmethod
    def _write_edge_data(
        self,
        edges: Iterable[
            Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
        ],
    ) -> bool:
        """Implement how to write edges to disk.

        Args:
            edges (Iterable): An iterable of BioCypherNode / BioCypherEdge /
                BioCypherRelAsNode objects.

        Returns:
            bool: The return value. True for success, False otherwise.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError(
            "Writer implementation must override '_write_edge_data'"
        )

    @abstractmethod
    def _construct_import_call(self) -> str:
        """Construct the import call.

        The call details folder and individual node and edge headers and
        data files, as well as delimiters and database name. Built after all
        data has been processed to ensure that nodes are called before any
        edges.

        Returns:
            str: command for importing the output files into a DBMS.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError(
            "Writer implementation must override '_construct_import_call'"
        )

    @abstractmethod
    def _get_import_script_name(self) -> str:
        """Return the name of the import script.

        Returns:
            str: The name of the import script (ending in .sh)

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError(
            "Writer implementation must override '_get_import_script_name'"
        )

    def write_nodes(
        self, nodes, batch_size: int = int(1e6), force: bool = False
    ) -> bool:
        """Write nodes by delegating to :py:meth:`_write_node_data`.

        Args:
            nodes (BioCypherNode): a list or generator of nodes in
                :py:class:`BioCypherNode` format
            batch_size (int): The batch size for writing nodes. Not used by
                this base implementation.
            force (bool): Whether to force writing nodes even if their type is
                not present in the schema. Not used by this base
                implementation.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        if self._write_node_data(nodes):
            return True

        logger.error("Error while writing node data.")
        return False

    def write_edges(
        self, edges, batch_size: int = int(1e6), force: bool = False
    ) -> bool:
        """Write edges by delegating to :py:meth:`_write_edge_data`.

        Args:
            edges (BioCypherEdge): a list or generator of edges in
                :py:class:`BioCypherEdge` format
            batch_size (int): The batch size for writing edges. Not used by
                this base implementation.
            force (bool): Whether to force writing edges even if their type is
                not present in the schema. Not used by this base
                implementation.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        if self._write_edge_data(edges):
            return True

        logger.error("Error while writing edge data.")
        return False

    def write_import_call(self) -> str:
        """Write the import call to a script file in the output directory.

        The content comes from :py:meth:`_construct_import_call` and the file
        name from :py:meth:`_get_import_script_name`. The file is written as
        UTF-8 and overwrites any existing file of the same name.

        Returns:
            str: The path of the file holding the import call.

        Raises:
            ValueError: If no output directory was configured.
        """
        # Fail with an explicit message rather than the opaque ``TypeError``
        # that ``os.path.join(None, ...)`` would raise.
        if self.output_directory is None:
            raise ValueError(
                "Cannot write the import call: no output directory was "
                "configured for this writer."
            )

        file_path = os.path.join(
            self.output_directory, self._get_import_script_name()
        )
        logger.info(
            f"Writing {self.__class__.__name__} import call to `{file_path}`."
        )

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(self._construct_import_call())

        return file_path
|