release commit

2025-04-16 22:12:19 +02:00
commit a9db0be88a
89 changed files with 2336827 additions and 0 deletions


@ -0,0 +1,422 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'online' mode. Handles connection and manipulation of a running DBMS.
"""
from biocypher._logger import logger
logger.debug(f"Loading module {__name__}.")
from collections.abc import Iterable
import itertools
import neo4j_utils
from biocypher import _misc
from biocypher._config import config as _config
from biocypher._create import BioCypherEdge, BioCypherNode
from biocypher._translate import Translator
__all__ = ["_Neo4jDriver"]
class _Neo4jDriver:
"""
Manages a BioCypher connection to a Neo4j database using the
``neo4j_utils.Driver`` class.
Args:
database_name (str): The name of the database to connect to.
wipe (bool): Whether to wipe the database before importing.
uri (str): The URI of the database.
user (str): The username to use for authentication.
password (str): The password to use for authentication.
multi_db (bool): Whether to use multi-database mode.
fetch_size (int): The number of records to fetch at a time.
increment_version (bool): Whether to increment the version number.
translator (Translator): The translator to use for mapping.
"""
def __init__(
self,
database_name: str,
uri: str,
user: str,
password: str,
multi_db: bool,
translator: Translator,
wipe: bool = False,
fetch_size: int = 1000,
increment_version: bool = True,
):
self.translator = translator
self._driver = neo4j_utils.Driver(
db_name=database_name,
db_uri=uri,
db_user=user,
db_passwd=password,
fetch_size=fetch_size,
wipe=wipe,
multi_db=multi_db,
raise_errors=True,
)
# check for biocypher config in connected graph
if wipe:
self.init_db()
if increment_version:
# set new current version node
self._update_meta_graph()
def _update_meta_graph(self):
logger.info("Updating Neo4j meta graph.")
# find current version node
db_version = self._driver.query(
"MATCH (v:BioCypher) " "WHERE NOT (v)-[:PRECEDES]->() " "RETURN v",
)
# add version node
self.add_biocypher_nodes(self.translator.ontology)
# connect version node to previous
if db_version[0]:
previous = db_version[0][0]
previous_id = previous["v"]["id"]
e_meta = BioCypherEdge(
previous_id,
self.translator.ontology.get_dict().get("node_id"),
"PRECEDES",
)
self.add_biocypher_edges(e_meta)
def init_db(self):
"""
Used to initialise a property graph database by setting up new
constraints. The wipe has already been performed by the
``neo4j_utils.Driver`` class.
Todo:
- set up constraint creation interactively depending on the
need of the database
"""
logger.info("Initialising database.")
self._create_constraints()
def _create_constraints(self):
"""
Creates constraints on node types in the graph. Used for
initial setup.
Grabs leaves of the ``schema_config.yaml`` file and creates
constraints on the id of all entities represented as nodes.
"""
logger.info("Creating constraints for node types in config.")
major_neo4j_version = int(self._get_neo4j_version().split(".")[0])
# get structure
for leaf in self.translator.ontology.mapping.extended_schema.items():
label = _misc.sentencecase_to_pascalcase(leaf[0], sep=r"\s\.")
if leaf[1]["represented_as"] == "node":
if major_neo4j_version >= 5:
s = (
f"CREATE CONSTRAINT `{label}_id` "
f"IF NOT EXISTS FOR (n:`{label}`) "
"REQUIRE n.id IS UNIQUE"
)
self._driver.query(s)
else:
s = (
f"CREATE CONSTRAINT `{label}_id` "
f"IF NOT EXISTS ON (n:`{label}`) "
"ASSERT n.id IS UNIQUE"
)
self._driver.query(s)
def _get_neo4j_version(self):
"""Get neo4j version."""
try:
neo4j_version = self._driver.query(
"""
CALL dbms.components()
YIELD name, versions, edition
UNWIND versions AS version
RETURN version AS version
""",
)[0][0]["version"]
return neo4j_version
except Exception as e:
logger.warning(
f"Error detecting Neo4j version: {e} use default version 4.0.0."
)
return "4.0.0"
def add_nodes(self, id_type_tuples: Iterable[tuple]) -> tuple:
"""
Generic node adder method to add any kind of input to the graph via the
:class:`biocypher.create.BioCypherNode` class. Employs translation
functionality and calls the :meth:`add_biocypher_nodes()` method.
Args:
id_type_tuples (iterable of 3-tuple): for each node to add to
the biocypher graph, a 3-tuple with the following layout:
first, the (unique if constrained) ID of the node; second, the
type of the node, capitalised or PascalCase and in noun form
(Neo4j primary label, eg `:Protein`); and third, a dictionary
of arbitrary properties the node should possess (can be empty).
Returns:
2-tuple: the query result of :meth:`add_biocypher_nodes()`
- first entry: data
- second entry: Neo4j summary.
"""
bn = self.translator.translate_nodes(id_type_tuples)
return self.add_biocypher_nodes(bn)
def add_edges(self, id_src_tar_type_tuples: Iterable[tuple]) -> tuple:
"""
Generic edge adder method to add any kind of input to the graph
via the :class:`biocypher.create.BioCypherEdge` class. Employs
translation functionality and calls the
:meth:`add_biocypher_edges()` method.
Args:
id_src_tar_type_tuples (iterable of 5-tuple):
for each edge to add to the biocypher graph, a 5-tuple
with the following layout: first, the optional unique ID
of the interaction. This can be `None` if there is no
systematic identifier (which for many interactions is
the case). Second and third, the (unique if constrained)
IDs of the source and target nodes of the relationship;
fourth, the type of the relationship; and fifth, a
dictionary of arbitrary properties the edge should
possess (can be empty).
Returns:
2-tuple: the query result of :meth:`add_biocypher_edges()`
- first entry: data
- second entry: Neo4j summary.
"""
bn = self.translator.translate_edges(id_src_tar_type_tuples)
return self.add_biocypher_edges(bn)
def add_biocypher_nodes(
self,
nodes: Iterable[BioCypherNode],
explain: bool = False,
profile: bool = False,
) -> bool:
"""
Accepts a node type handoff class
(:class:`biocypher.create.BioCypherNode`) with id,
label, and a dict of properties (passing on the type of
property, ie, ``int``, ``str``, ...).
The dict retrieved by the
:meth:`biocypher.create.BioCypherNode.get_dict()` method is
passed into Neo4j as a map of maps, explicitly encoding node id
and label, and adding all other properties from the 'properties'
key of the dict. The merge is performed via APOC, matching only
on node id to prevent duplicates. The same properties are set on
match and on create, irrespective of the actual event.
Args:
nodes:
An iterable of :class:`biocypher.create.BioCypherNode` objects.
explain:
Call ``EXPLAIN`` on the CYPHER query.
profile:
Do profiling on the CYPHER query.
Returns:
True for success, False otherwise.
"""
try:
nodes = _misc.to_list(nodes)
entities = [node.get_dict() for node in nodes]
except AttributeError:
msg = "Nodes must have a `get_dict` method."
logger.error(msg)
raise ValueError(msg)
logger.info(f"Merging {len(entities)} nodes.")
entity_query = (
"UNWIND $entities AS ent "
"CALL apoc.merge.node([ent.node_label], "
"{id: ent.node_id}, ent.properties, ent.properties) "
"YIELD node "
"RETURN node"
)
method = "explain" if explain else "profile" if profile else "query"
result = getattr(self._driver, method)(
entity_query,
parameters={
"entities": entities,
},
)
logger.info("Finished merging nodes.")
return result
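# --- Illustrative note (not part of the original code) ---
# The `entities` parameter bound to the APOC merge query above is a list of
# dicts of the shape produced by `BioCypherNode.get_dict()`, for example:
#
#   {"node_id": "uniprot:P12345", "node_label": "Protein",
#    "properties": {"name": "example protein"}}
#
# The id and property values shown here are placeholders.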
def add_biocypher_edges(
self,
edges: Iterable[BioCypherEdge],
explain: bool = False,
profile: bool = False,
) -> bool:
"""
Accepts an edge type handoff class
(:class:`biocypher.create.BioCypherEdge`) with source
and target ids, label, and a dict of properties (passing on the
type of property, ie, int, string ...).
The individual edge is either passed as a singleton, in the case
of representation as an edge in the graph, or as a 4-tuple, in
the case of representation as a node (with two edges connecting
to interaction partners).
The dict retrieved by the
:meth:`biocypher.create.BioCypherEdge.get_dict()` method is
passed into Neo4j as a map of maps, explicitly encoding source
and target ids and the relationship label, and adding all edge
properties from the 'properties' key of the dict. The merge is
performed via APOC, matching only on source and target id to
prevent duplicates. The same properties are set on match and on
create, irrespective of the actual event.
Args:
edges:
An iterable of :class:`biocypher.create.BioCypherEdge` objects.
explain:
Call ``EXPLAIN`` on the CYPHER query.
profile:
Do profiling on the CYPHER query.
Returns:
`True` for success, `False` otherwise.
"""
edges = _misc.ensure_iterable(edges)
edges = itertools.chain(*(_misc.ensure_iterable(i) for i in edges))
nodes = []
rels = []
try:
for e in edges:
if hasattr(e, "get_node"):
nodes.append(e.get_node())
rels.append(e.get_source_edge().get_dict())
rels.append(e.get_target_edge().get_dict())
else:
rels.append(e.get_dict())
except AttributeError:
msg = "Edges and nodes must have a `get_dict` method."
logger.error(msg)
raise ValueError(msg)
self.add_biocypher_nodes(nodes)
logger.info(f"Merging {len(rels)} edges.")
# cypher query
# merging only on the ids of the entities, passing the
# properties on match and on create;
# TODO add node labels?
node_query = (
"UNWIND $rels AS r "
"MERGE (src {id: r.source_id}) "
"MERGE (tar {id: r.target_id}) "
)
self._driver.query(node_query, parameters={"rels": rels})
edge_query = (
"UNWIND $rels AS r "
"MATCH (src {id: r.source_id}) "
"MATCH (tar {id: r.target_id}) "
"WITH src, tar, r "
"CALL apoc.merge.relationship"
"(src, r.relationship_label, NULL, "
"r.properties, tar, r.properties) "
"YIELD rel "
"RETURN rel"
)
method = "explain" if explain else "profile" if profile else "query"
result = getattr(self._driver, method)(
edge_query, parameters={"rels": rels}
)
logger.info("Finished merging edges.")
return result
def get_driver(
dbms: str,
translator: "Translator",
):
"""
Function to return the online driver instance for the selected DBMS.
Args:
    dbms: the database management system; only "neo4j" is handled here.
    translator: the Translator object.
Returns:
    instance: an instance of the selected driver class, or None if no
    online driver is available for the DBMS.
"""
dbms_config = _config(dbms)
if dbms == "neo4j":
return _Neo4jDriver(
database_name=dbms_config["database_name"],
wipe=dbms_config["wipe"],
uri=dbms_config["uri"],
user=dbms_config["user"],
password=dbms_config["password"],
multi_db=dbms_config["multi_db"],
translator=translator,
)
return None
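# --- Illustrative usage sketch (not part of the original module) ---
# How the online driver might be obtained and used, assuming a configured
# "neo4j" section in the BioCypher configuration and an existing Translator
# instance; all ids, labels, and properties below are placeholders.
#
#   translator = ...  # a BioCypher Translator built from the schema config
#   driver = get_driver(dbms="neo4j", translator=translator)
#   if driver is not None:
#       # 3-tuples: id, type, properties (see add_nodes above)
#       driver.add_nodes([("uniprot:P12345", "protein", {"name": "example"})])
#       # 5-tuples: optional id, source, target, type, properties (see add_edges)
#       driver.add_edges(
#           [(None, "uniprot:P12345", "uniprot:Q67890", "interacts_with", {})]
#       )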


@ -0,0 +1,90 @@
import pandas as pd
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
class Pandas:
def __init__(self, translator, deduplicator):
self.translator = translator
self.deduplicator = deduplicator
self.dfs = {}
def _separate_entity_types(self, entities):
"""
Given mixed iterable of BioCypher objects, separate them into lists by
type. Also deduplicates using the `Deduplicator` instance.
"""
lists = {}
for entity in entities:
if (
not isinstance(entity, BioCypherNode)
and not isinstance(entity, BioCypherEdge)
and not isinstance(entity, BioCypherRelAsNode)
):
raise TypeError(
"Expected a BioCypherNode / BioCypherEdge / "
f"BioCypherRelAsNode, got {type(entity)}."
)
if isinstance(entity, BioCypherNode):
seen = self.deduplicator.node_seen(entity)
elif isinstance(entity, BioCypherEdge):
seen = self.deduplicator.edge_seen(entity)
elif isinstance(entity, BioCypherRelAsNode):
seen = self.deduplicator.rel_as_node_seen(entity)
if seen:
continue
if isinstance(entity, BioCypherRelAsNode):
node = entity.get_node()
source_edge = entity.get_source_edge()
target_edge = entity.get_target_edge()
_type = node.get_type()
if _type not in lists:
lists[_type] = []
lists[_type].append(node)
_source_type = source_edge.get_type()
if _source_type not in lists:
lists[_source_type] = []
lists[_source_type].append(source_edge)
_target_type = target_edge.get_type()
if _target_type not in lists:
lists[_target_type] = []
lists[_target_type].append(target_edge)
continue
_type = entity.get_type()
if _type not in lists:
lists[_type] = []
lists[_type].append(entity)
return lists
def add_tables(self, entities):
"""
Add Pandas dataframes for each node and edge type in the input.
"""
lists = self._separate_entity_types(entities)
for _type, _entities in lists.items():
self._add_entity_df(_type, _entities)
def _add_entity_df(self, _type, _entities):
df = pd.DataFrame(
pd.json_normalize([entity.get_dict() for entity in _entities])
)
# replace "properties." with "" in column names
df.columns = [col.replace("properties.", "") for col in df.columns]
if _type not in self.dfs:
self.dfs[_type] = df
else:
self.dfs[_type] = pd.concat(
[self.dfs[_type], df], ignore_index=True
)
return self.dfs[_type]
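# --- Illustrative sketch (not part of the original module) ---
# Shows the core of `_add_entity_df` with plain pandas: dicts of the shape
# produced by `BioCypherNode.get_dict()` are flattened with json_normalize
# and the "properties." prefix is stripped from the column names. The ids
# and property values are made up for illustration.
if __name__ == "__main__":
    example_dicts = [
        {"node_id": "p1", "node_label": "protein", "properties": {"name": "BRCA1"}},
        {"node_id": "p2", "node_label": "protein", "properties": {"name": "TP53"}},
    ]
    df = pd.DataFrame(pd.json_normalize(example_dicts))
    df.columns = [col.replace("properties.", "") for col in df.columns]
    print(df)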


File diff suppressed because it is too large.


@ -0,0 +1,113 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# Michael Hartung
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'offline' module. Handles the writing of node and edge representations
suitable for import into a DBMS.
"""
from biocypher._logger import logger
from biocypher.output.write.graph._rdf import _RDFWriter
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
from biocypher.output.write.graph._networkx import _NetworkXWriter
from biocypher.output.write.relational._csv import _PandasCSVWriter
from biocypher.output.write.relational._sqlite import _SQLiteBatchWriter
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
logger.debug(f"Loading module {__name__}.")
from typing import TYPE_CHECKING
from biocypher._config import config as _config
__all__ = ["get_writer", "DBMS_TO_CLASS"]
if TYPE_CHECKING:
from biocypher._translate import Translator
from biocypher._deduplicate import Deduplicator
DBMS_TO_CLASS = {
"neo": _Neo4jBatchWriter,
"neo4j": _Neo4jBatchWriter,
"Neo4j": _Neo4jBatchWriter,
"postgres": _PostgreSQLBatchWriter,
"postgresql": _PostgreSQLBatchWriter,
"PostgreSQL": _PostgreSQLBatchWriter,
"arango": _ArangoDBBatchWriter,
"arangodb": _ArangoDBBatchWriter,
"ArangoDB": _ArangoDBBatchWriter,
"sqlite": _SQLiteBatchWriter,
"sqlite3": _SQLiteBatchWriter,
"rdf": _RDFWriter,
"RDF": _RDFWriter,
"csv": _PandasCSVWriter,
"CSV": _PandasCSVWriter,
"pandas": _PandasCSVWriter,
"Pandas": _PandasCSVWriter,
"networkx": _NetworkXWriter,
"NetworkX": _NetworkXWriter,
}
def get_writer(
dbms: str,
translator: "Translator",
deduplicator: "Deduplicator",
output_directory: str,
strict_mode: bool,
):
"""
Function to return the writer class based on the selection in the config
file.
Args:
dbms: the database management system; for options, see DBMS_TO_CLASS.
translator: the Translator object.
deduplicator: the Deduplicator object.
output_directory: the directory to write the output files to.
strict_mode: whether to use strict mode.
Returns:
instance: an instance of the selected writer class.
"""
dbms_config = _config(dbms)
writer = DBMS_TO_CLASS.get(dbms)
if not writer:
raise ValueError(f"Unknown dbms: {dbms}")
if writer is not None:
return writer(
translator=translator,
deduplicator=deduplicator,
delimiter=dbms_config.get("delimiter"),
array_delimiter=dbms_config.get("array_delimiter"),
quote=dbms_config.get("quote_character"),
output_directory=output_directory,
db_name=dbms_config.get("database_name"),
import_call_bin_prefix=dbms_config.get("import_call_bin_prefix"),
import_call_file_prefix=dbms_config.get("import_call_file_prefix"),
wipe=dbms_config.get("wipe"),
strict_mode=strict_mode,
skip_bad_relationships=dbms_config.get(
"skip_bad_relationships"
), # neo4j
skip_duplicate_nodes=dbms_config.get(
"skip_duplicate_nodes"
), # neo4j
db_user=dbms_config.get("user"), # psql
db_password=dbms_config.get("password"), # psql
db_port=dbms_config.get("port"), # psql
rdf_format=dbms_config.get("rdf_format"), # rdf
rdf_namespaces=dbms_config.get("rdf_namespaces"), # rdf
)
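# --- Illustrative usage sketch (not part of the original module) ---
# get_writer() is typically called by the BioCypher core with a Translator
# and Deduplicator; the objects and the output path below are placeholders.
#
#   writer = get_writer(
#       dbms="neo4j",
#       translator=translator,
#       deduplicator=deduplicator,
#       output_directory="biocypher-out/",
#       strict_mode=False,
#   )
#
# The supported DBMS keys can be inspected directly:
if __name__ == "__main__":
    print(sorted(set(DBMS_TO_CLASS)))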


@ -0,0 +1,200 @@
from abc import ABC, abstractmethod
from typing import Union, Optional
from collections.abc import Iterable
import os
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
from biocypher._logger import logger
from biocypher._translate import Translator
from biocypher._deduplicate import Deduplicator
__all__ = ["_Writer"]
class _Writer(ABC):
"""Abstract class for writing node and edge representations to disk.
Specifics of the different writers (e.g. neo4j, postgresql, csv, etc.)
are implemented in the child classes. Any concrete writer needs to
implement at least:
- _write_node_data
- _write_edge_data
- _construct_import_call
- _get_import_script_name
Args:
translator (Translator): Instance of :py:class:`Translator` to enable translation of
nodes and manipulation of properties.
deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
of nodes and edges.
output_directory (str, optional): Path for exporting CSV files. Defaults to None.
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
Raises:
NotImplementedError: Writer implementation must override '_write_node_data'
NotImplementedError: Writer implementation must override '_write_edge_data'
NotImplementedError: Writer implementation must override '_construct_import_call'
NotImplementedError: Writer implementation must override '_get_import_script_name'
"""
def __init__(
self,
translator: Translator,
deduplicator: Deduplicator,
output_directory: Optional[str] = None,
strict_mode: bool = False,
*args,
**kwargs,
):
"""Abstract class for writing node and edge representations to disk.
Args:
translator (Translator): Instance of :py:class:`Translator` to enable translation of
nodes and manipulation of properties.
deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
of nodes and edges.
output_directory (str, optional): Path for exporting CSV files. Defaults to None.
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
"""
self.translator = translator
self.deduplicator = deduplicator
self.strict_mode = strict_mode
self.output_directory = output_directory
if os.path.exists(self.output_directory):
if kwargs.get("write_to_file", True):
logger.warning(
f"Output directory `{self.output_directory}` already exists. "
"If this is not planned, file consistency may be compromised."
)
else:
logger.info(f"Creating output directory `{self.output_directory}`.")
os.makedirs(self.output_directory)
@abstractmethod
def _write_node_data(
self,
nodes: Iterable[
Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
],
) -> bool:
"""Implement how to output.write nodes to disk.
Args:
nodes (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
Returns:
bool: The return value. True for success, False otherwise.
"""
raise NotImplementedError(
"Writer implementation must override 'write_nodes'"
)
@abstractmethod
def _write_edge_data(
self,
edges: Iterable[
Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
],
) -> bool:
"""Implement how to output.write edges to disk.
Args:
edges (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
Returns:
bool: The return value. True for success, False otherwise.
"""
raise NotImplementedError(
"Writer implementation must override 'write_edges'"
)
@abstractmethod
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: command for importing the output files into a DBMS.
"""
raise NotImplementedError(
"Writer implementation must override '_construct_import_call'"
)
@abstractmethod
def _get_import_script_name(self) -> str:
"""Returns the name of the import script.
Returns:
str: The name of the import script (ending in .sh)
"""
raise NotImplementedError(
"Writer implementation must override '_get_import_script_name'"
)
def write_nodes(
self, nodes, batch_size: int = int(1e6), force: bool = False
):
"""Wrapper for writing nodes.
Args:
nodes (BioCypherNode): a list or generator of nodes in
:py:class:`BioCypherNode` format
batch_size (int): The batch size for writing nodes.
force (bool): Whether to force writing nodes even if their type is
not present in the schema.
Returns:
bool: The return value. True for success, False otherwise.
"""
passed = self._write_node_data(nodes)
if not passed:
logger.error("Error while writing node data.")
return False
return True
def write_edges(
self, edges, batch_size: int = int(1e6), force: bool = False
):
"""Wrapper for writing edges.
Args:
edges (BioCypherEdge): a list or generator of edges in
:py:class:`BioCypherEdge` format
batch_size (int): The batch size for writing edges.
force (bool): Whether to force writing edges even if their type is
not present in the schema.
Returns:
bool: The return value. True for success, False otherwise.
"""
passed = self._write_edge_data(edges)
if not passed:
logger.error("Error while writing edge data.")
return False
return True
def write_import_call(self):
"""
Function to write the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name, to the export folder as txt.
Returns:
str: The path of the file holding the import call.
"""
file_path = os.path.join(
self.output_directory, self._get_import_script_name()
)
logger.info(
f"Writing {self.__class__.__name__} import call to `{file_path}`."
)
with open(file_path, "w", encoding="utf-8") as f:
f.write(self._construct_import_call())
return file_path
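# --- Illustrative sketch (not part of the original module) ---
# A minimal concrete `_Writer` subclass showing which methods a new output
# format must implement; the class name, script name, and command string are
# invented for illustration only.
class _NullWriter(_Writer):
    """Toy writer that discards all data; for demonstration purposes."""

    def _write_node_data(self, nodes) -> bool:
        # A real writer would serialise the nodes here.
        return True

    def _write_edge_data(self, edges) -> bool:
        # A real writer would serialise the edges here.
        return True

    def _construct_import_call(self) -> str:
        return "echo 'nothing to import'"

    def _get_import_script_name(self) -> str:
        return "null-import-call.sh"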


@ -0,0 +1,241 @@
import os
from biocypher._logger import logger
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
class _ArangoDBBatchWriter(_Neo4jBatchWriter):
"""
Class for writing node and edge representations to disk using the format
specified by ArangoDB for the use of "arangoimport". Output files are
similar to Neo4j, but with a different header format.
"""
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location for the neo4j admin import location
"""
return ""
def _get_import_script_name(self) -> str:
"""
Returns the name of the ArangoDB import script.
Returns:
str: The name of the import script (ending in .sh)
"""
return "arangodb-import-call.sh"
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
# create header CSV with ID, properties, labels
_id = "_key"
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"File {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k in props.keys():
props_list.append(f"{k}")
# create list of lists and flatten
# removes need for empty check of property list
out_list = [[_id], props_list]
out_list = [val for sublist in out_list for val in sublist]
with open(header_path, "w", encoding="utf-8") as f:
# concatenate with delimiter
row = self.delim.join(out_list)
f.write(row)
# add collection from schema config
collection = self.translator.ontology.mapping.extended_schema[
label
].get("db_collection_name", None)
# add file path to the arangoimport statement
# do once for each part file
parts = self.parts.get(label, [])
if not parts:
raise ValueError(
f"No parts found for node label {label}. "
f"Check that the data was parsed first.",
)
for part in parts:
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
part,
)
self.import_call_nodes.add(
(
import_call_header_path,
import_call_parts_path,
collection,
)
)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
# paths
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
# check for file exists
if os.path.exists(header_path):
logger.warning(
f"Header file {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k in props.keys():
props_list.append(f"{k}")
out_list = ["_from", "_key", *props_list, "_to"]
with open(header_path, "w", encoding="utf-8") as f:
# concatenate with delimiter
row = self.delim.join(out_list)
f.write(row)
# add collection from schema config
if not self.translator.ontology.mapping.extended_schema.get(label):
for (
_,
v,
) in self.translator.ontology.mapping.extended_schema.items():
if v.get("label_as_edge") == label:
collection = v.get("db_collection_name", None)
break
else:
collection = self.translator.ontology.mapping.extended_schema[
label
].get("db_collection_name", None)
# add file path to the arangoimport statement (import call path
# may be different from actual output path)
header_import_call_path = os.path.join(
self.import_call_file_prefix,
header,
)
parts_import_call_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_edges.add(
(
header_import_call_path,
parts_import_call_path,
collection,
)
)
return True
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for arangoimport
"""
import_call = (
f"{self.import_call_bin_prefix}arangoimp "
f"--type csv "
f'--separator="{self.escaped_delim}" '
)
if self.quote == "'":
import_call += f'--quote="{self.quote}" '
else:
import_call += f"--quote='{self.quote}' "
node_lines = ""
# node import calls: one line per node type
for header_path, parts_path, collection in self.import_call_nodes:
line = (
f"{import_call} "
f"--headers-file {header_path} "
f"--file= {parts_path} "
)
if collection:
line += f"--create-collection --collection {collection} "
node_lines += f"{line}\n"
edge_lines = ""
# edge import calls: one line per edge type
for header_path, parts_path, collection in self.import_call_edges:
    line = (
        f"{import_call} "
        f"--headers-file {header_path} "
        f"--file={parts_path} "
    )
    if collection:
        line += f"--create-collection --collection {collection} "
    edge_lines += f"{line}\n"
return node_lines + edge_lines
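# --- Illustrative sketch (not part of the original class) ---
# With delimiter ";" and double-quote quoting, one generated node import line
# would look roughly as follows (paths and the collection name are
# placeholders, and the binary name follows the bin prefix configured above):
#
#   arangoimp --type csv --separator=";" --quote='"' \
#       --headers-file /path/Protein-header.csv \
#       --file=/path/Protein-part000.csv \
#       --create-collection --collection proteins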


@ -0,0 +1,502 @@
import os
import glob
import pandas as pd
from biocypher._logger import logger
from biocypher.output.write._batch_writer import parse_label, _BatchWriter
class _Neo4jBatchWriter(_BatchWriter):
"""
Class for writing node and edge representations to disk using the
format specified by Neo4j for the use of admin import. Each batch
writer instance has a fixed representation that needs to be passed
at instantiation via the :py:attr:`schema` argument. The instance
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
to convert and extend the hierarchy.
This class inherits from the abstract class "_BatchWriter" and implements the
Neo4j-specific methods:
- _write_node_headers
- _write_edge_headers
- _construct_import_call
- _write_array_string
"""
def __init__(self, *args, **kwargs):
"""
Constructor.
Checks the version of Neo4j and adds a command scope if version >= 5.
Returns:
_Neo4jBatchWriter: An instance of the writer.
"""
# Should read the configuration and setup import_call_bin_prefix.
super().__init__(*args, **kwargs)
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location for the neo4j admin import location
"""
return "bin/"
def _write_array_string(self, string_list):
"""
Abstract method to write the string representation of an array into a .csv file
as required by the neo4j admin-import.
Args:
string_list (list): list of ontology strings
Returns:
str: The string representation of an array for the neo4j admin import
"""
string = self.adelim.join(string_list)
return f"{self.quote}{string}{self.quote}"
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
_id = ":ID"
## MeDaX dev remark:
## FHIR data yields case-sensitive labels, e.g. 'Procedure' and 'procedure' are two distinct node types,
## because Resources are converted to more specific node classes using their "resourceType" attribute.
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(
parse_label(label)
)
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
existing_header = False
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"Header file `{header_path}` already exists. Overwriting.",
)
with open(header_path, "r", encoding="utf-8") as existing:
existing_header = existing.read().strip().split(self.delim)
# concatenate key:value in props
props_list = []
for k, v in props.items():
if v in ["int", "long", "integer"]:
props_list.append(f"{k}:long")
elif v in ["int[]", "long[]", "integer[]"]:
props_list.append(f"{k}:long[]")
elif v in ["float", "double", "dbl"]:
props_list.append(f"{k}:double")
elif v in ["float[]", "double[]"]:
props_list.append(f"{k}:double[]")
elif v in ["bool", "boolean"]:
# TODO Neo4j boolean support / spelling?
props_list.append(f"{k}:boolean")
elif v in ["bool[]", "boolean[]"]:
props_list.append(f"{k}:boolean[]")
elif v in ["str[]", "string[]"]:
props_list.append(f"{k}:string[]")
else:
props_list.append(f"{k}")
# create list of lists and flatten
out_list = [[_id], props_list, [":LABEL"]]
out_list = [val for sublist in out_list for val in sublist]
with open(header_path, "w", encoding="utf-8") as f:
# Check if header file already exists and has different columns
if os.path.exists(header_path):
if existing_header:
#existing_header = existing.read().strip().split(self.delim)
# Compare existing and new headers
if set(existing_header) != set(out_list):
# Get part files associated with this header
base_name = os.path.basename(header_path).replace("-header.csv", "")
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
# Find the highest numbered part file without full sorting
highest_part = None
highest_number = -1
for part_file in part_files:
try:
# Extract number from filename (assuming format like "part123.csv")
file_name = os.path.basename(part_file)
number_part = file_name.split("part")[1].split(".")[0]
number = int(number_part)
if number > highest_number:
highest_number = number
highest_part = part_file
except (IndexError, ValueError):
# Skip files that don't match the expected pattern
continue
# Update each part file with the new columns
for part_file in part_files:
if part_file == highest_part:
print(f"Skipping the highest part file: {highest_part}")
continue
try:
#print("exi: ", existing_header)
#print("out: ", out_list)
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
# Write the adapted data back WITHOUT including the header row
df.to_csv(part_file, sep=self.delim, index=False, header=False)
print(f"Updated {part_file} with new columns in correct positions")
except Exception as e:
print(f"Error updating {part_file}: {e}")
# Write the new header
row = self.delim.join(out_list)
f.write(row)
# add file path to the neo4j-admin import statement (import call file
# path may be different from actual file path)
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_nodes.add(
(import_call_header_path, import_call_parts_path)
)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(
parse_label(label)
)
# paths
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
# check for file exists
if os.path.exists(header_path):
logger.warning(
f"File {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k, v in props.items():
if v in ["int", "long", "integer"]:
props_list.append(f"{k}:long")
elif v in ["int[]", "long[]", "integer[]"]:
props_list.append(f"{k}:long[]")
elif v in ["float", "double"]:
props_list.append(f"{k}:double")
elif v in ["float[]", "double[]"]:
props_list.append(f"{k}:double[]")
elif v in [
"bool",
"boolean",
]: # TODO does Neo4j support bool?
props_list.append(f"{k}:boolean")
elif v in ["bool[]", "boolean[]"]:
props_list.append(f"{k}:boolean[]")
elif v in ["str[]", "string[]"]:
props_list.append(f"{k}:string[]")
else:
props_list.append(f"{k}")
skip_id = False
schema_label = None
if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
skip_id = True
elif not self.translator.ontology.mapping.extended_schema.get(
label
):
# find label in schema by label_as_edge
for (
k,
v,
) in self.translator.ontology.mapping.extended_schema.items():
if v.get("label_as_edge") == label:
schema_label = k
break
else:
schema_label = label
out_list = [":START_ID"]
if schema_label:
if (
self.translator.ontology.mapping.extended_schema.get(
schema_label
).get("use_id")
== False
):
skip_id = True
if not skip_id:
out_list.append("id")
out_list.extend(props_list)
out_list.extend([":END_ID", ":TYPE"])
existing_header = False
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"Header file `{header_path}` already exists. Overwriting.",
)
with open(header_path, "r", encoding="utf-8") as existing:
existing_header = existing.read().strip().split(self.delim)
with open(header_path, "w", encoding="utf-8") as f:
# Check if header file already exists and has different columns
if os.path.exists(header_path):
if existing_header:
#existing_header = existing.read().strip().split(self.delim)
# Compare existing and new headers
if set(existing_header) != set(out_list):
# Get part files associated with this header
base_name = os.path.basename(header_path).replace("-header.csv", "")
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
# Find the highest numbered part file without full sorting
highest_part = None
highest_number = -1
for part_file in part_files:
try:
# Extract number from filename (assuming format like "part123.csv")
file_name = os.path.basename(part_file)
number_part = file_name.split("part")[1].split(".")[0]
number = int(number_part)
if number > highest_number:
highest_number = number
highest_part = part_file
except (IndexError, ValueError):
# Skip files that don't match the expected pattern
continue
# Update each part file with the new columns
for part_file in part_files:
if part_file == highest_part:
print(f"Skipping the highest part file: {highest_part}")
continue
try:
print("exi: ", existing_header)
print("out: ", out_list)
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
# Write the adapted data back WITHOUT including the header row
df.to_csv(part_file, sep=self.delim, index=False, header=False)
print(f"Updated {part_file} with new columns in correct positions")
except Exception as e:
print(f"Error updating {part_file}: {e}")
# Write the new header
row = self.delim.join(out_list)
f.write(row)
# add file path to the neo4j-admin import statement (import call file
# path may be different from actual file path)
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_edges.add(
(import_call_header_path, import_call_parts_path)
)
return True
def _get_import_script_name(self) -> str:
"""
Returns the name of the neo4j admin import script
Returns:
str: The name of the import script (ending in .sh)
"""
return "neo4j-admin-import-call.sh"
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for neo4j-admin import
"""
import_call_neo4j_v4 = self._get_import_call(
"import", "--database=", "--force="
)
import_call_neo4j_v5 = self._get_import_call(
"database import full", "", "--overwrite-destination="
)
neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"
import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
return import_script
def _get_import_call(
self, import_cmd: str, database_cmd: str, wipe_cmd: str
) -> str:
"""Get parametrized import call for Neo4j 4 or 5+.
Args:
import_cmd (str): The import command to use.
database_cmd (str): The database command to use.
wipe_cmd (str): The wipe command to use.
Returns:
str: The import call.
"""
import_call = f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "
import_call += f"{database_cmd}{self.db_name} "
import_call += f'--delimiter="{self.escaped_delim}" '
import_call += f'--array-delimiter="{self.escaped_adelim}" '
if self.quote == "'":
import_call += f'--quote="{self.quote}" '
else:
import_call += f"--quote='{self.quote}' "
if self.wipe:
import_call += f"{wipe_cmd}true "
if self.skip_bad_relationships:
import_call += "--skip-bad-relationships=true "
if self.skip_duplicate_nodes:
import_call += "--skip-duplicate-nodes=true "
# append node import calls
for header_path, parts_path in self.import_call_nodes:
import_call += f'--nodes="{header_path},{parts_path}" '
# append edge import calls
for header_path, parts_path in self.import_call_edges:
import_call += f'--relationships="{header_path},{parts_path}" '
return import_call
def adapt_csv_to_new_header(self, old_header, new_header, csv_file_path):
"""
Adapt a CSV table to a new header structure, placing new columns in their correct positions.
Parameters:
old_header (list): The original header columns
new_header (list): The new header columns
csv_file_path (str): Path to the CSV file
Returns:
pandas.DataFrame: CSV data with the new header structure
"""
# Step 1: Read the CSV data without headers
df = pd.read_csv(csv_file_path, sep=self.delim, header=None)
# Step 2: If the file is empty, return empty DataFrame with new headers
if df.empty:
return pd.DataFrame(columns=new_header)
# Step 3: If column count doesn't match old_header length, handle the mismatch
if len(df.columns) != len(old_header):
print(f"Warning: CSV columns count ({len(df.columns)}) doesn't match the provided old header count ({len(old_header)})")
# If file has fewer columns than old_header, pad with NaN
if len(df.columns) < len(old_header):
for i in range(len(df.columns), len(old_header)):
df[i] = None
# If file has more columns than old_header, truncate
else:
df = df.iloc[:, :len(old_header)]
# Step 4: Assign old header names to the dataframe
df.columns = old_header
# Step 5: Create a new DataFrame with the correct structure
new_df = pd.DataFrame(columns=new_header)
# Step 6: For each column in the new header, find its position in the old header
for new_col_idx, new_col in enumerate(new_header):
if new_col in old_header:
# If column exists in old header, copy data
new_df[new_col] = df[new_col]
else:
# If new column, add empty column
new_df[new_col] = None
# Step 7: Ensure columns are in the exact order of new_header
new_df = new_df[new_header]
return new_df
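# --- Illustrative sketch (not part of the original module) ---
# Demonstrates, with plain pandas, the column re-mapping idea behind
# `adapt_csv_to_new_header`: rows written under an old header are mapped onto
# a new header, and columns that did not exist before are filled with None.
# The header names and values are made up for illustration.
if __name__ == "__main__":
    old_header = [":ID", "name", ":LABEL"]
    new_header = [":ID", "name", "taxon:long", ":LABEL"]
    df = pd.DataFrame([["p1", "BRCA1", "Protein"]], columns=old_header)
    adapted = pd.DataFrame(columns=new_header)
    for col in new_header:
        adapted[col] = df[col] if col in old_header else None
    print(adapted)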


@ -0,0 +1,76 @@
import pickle
import networkx as nx
from biocypher._logger import logger
from biocypher.output.write._writer import _Writer
from biocypher.output.write.relational._csv import _PandasCSVWriter
class _NetworkXWriter(_Writer):
"""
Class for writing nodes and edges to a networkx DiGraph.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.csv_writer = _PandasCSVWriter(*args, write_to_file=False, **kwargs)
self.G = nx.DiGraph()
def _construct_import_call(self) -> str:
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
Returns:
str: Python code to load the csv files into Pandas dfs.
"""
logger.info(
f"Writing networkx {self.G} to pickle file networkx_graph.pkl."
)
with open(f"{self.output_directory}/networkx_graph.pkl", "wb") as f:
pickle.dump(self.G, f)
import_call = "import pickle\n"
import_call += "with open('./networkx_graph.pkl', 'rb') as f:\n\tG_loaded = pickle.load(f)"
return import_call
def _get_import_script_name(self) -> str:
"""Function to return the name of the import script."""
return "import_networkx.py"
def _write_node_data(self, nodes) -> bool:
passed = self.csv_writer._write_entities_to_file(nodes)
self.add_to_networkx()
return passed
def _write_edge_data(self, edges) -> bool:
passed = self.csv_writer._write_entities_to_file(edges)
self.add_to_networkx()
return passed
def add_to_networkx(self) -> bool:
all_dfs = self.csv_writer.stored_dfs
node_dfs = [
df
for df in all_dfs.values()
if df.columns.str.contains("node_id").any()
]
edge_dfs = [
df
for df in all_dfs.values()
if df.columns.str.contains("source_id").any()
and df.columns.str.contains("target_id").any()
]
for df in node_dfs:
nodes = df.set_index("node_id").to_dict(orient="index")
self.G.add_nodes_from(nodes.items())
for df in edge_dfs:
edges = df.set_index(["source_id", "target_id"]).to_dict(
orient="index"
)
self.G.add_edges_from(
(
(source, target, attrs)
for (source, target), attrs in edges.items()
)
)
return True
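# --- Illustrative sketch (not part of the original class) ---
# Mirrors `add_to_networkx` with hand-made dataframes instead of the stored
# CSV dataframes; the node ids, names, and edge type are placeholders.
if __name__ == "__main__":
    import pandas as pd

    node_df = pd.DataFrame({"node_id": ["p1", "p2"], "name": ["BRCA1", "TP53"]})
    edge_df = pd.DataFrame(
        {"source_id": ["p1"], "target_id": ["p2"], "type": ["interacts_with"]}
    )
    G = nx.DiGraph()
    G.add_nodes_from(node_df.set_index("node_id").to_dict(orient="index").items())
    edges = edge_df.set_index(["source_id", "target_id"]).to_dict(orient="index")
    G.add_edges_from((s, t, attrs) for (s, t), attrs in edges.items())
    print(G)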


@ -0,0 +1,515 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Loes van den Biggelaar
# Sebastian Lobentanzer
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'offline' module. Handles the writing of node and edge representations
suitable for import into a DBMS.
"""
from types import GeneratorType
from typing import Union
import os
from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
from rdflib.namespace import (
_NAMESPACE_PREFIXES_CORE,
_NAMESPACE_PREFIXES_RDFLIB,
)
from biocypher._create import BioCypherEdge, BioCypherNode
from biocypher._logger import logger
from biocypher.output.write._batch_writer import _BatchWriter
class _RDFWriter(_BatchWriter):
"""
Class to write BioCypher's property graph into an RDF format using
rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
is done keeping only the minimum information about node and edges,
skipping all properties.
"""
def _get_import_script_name(self) -> str:
"""
Returns the name of the RDF admin import script.
This function is applicable for RDF export.
Returns:
str: The name of the import script (ending in .sh)
"""
return "rdf-import-call.sh"
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location for the RDF admin import location
"""
return "bin/"
def _is_rdf_format_supported(self, rdf_format: str) -> bool:
"""
Function to check if the specified RDF format is supported.
Args:
rdf_format (str): The RDF format to check.
Returns:
bool: Returns True if rdf format supported, False otherwise.
"""
supported_formats = [
"xml",
"n3",
"turtle",
"nt",
"pretty-xml",
"trix",
"trig",
"nquads",
"json-ld",
]
if rdf_format not in supported_formats:
logger.error(
f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
)
return False
else:
# The RDF graph does not support the 'ttl' format, only 'turtle'; however, the preferred file extension is always '.ttl'.
if self.rdf_format == "turtle":
self.extension = "ttl"
elif self.rdf_format == "ttl":
self.rdf_format = "turtle"
self.extension = "ttl"
else:
self.extension = self.rdf_format
return True
def _write_single_edge_list_to_file(
self,
edge_list: list,
label: str,
prop_dict: dict,
):
"""
This function takes one list of biocypher edges and writes them
to an RDF file with the given format.
Args:
edge_list (list): list of BioCypherEdges to be written
label (str): the label (type) of the edge
prop_dict (dict): properties of node class passed from parsing
function and their types
Returns:
bool: The return value. True for success, False otherwise.
"""
if not all(isinstance(n, BioCypherEdge) for n in edge_list):
logger.error("Edges must be passed as type BioCypherEdge.")
return False
# translate label to PascalCase
label_pascal = self.translator.name_sentence_to_pascal(label)
# create file name
file_name = os.path.join(
self.outdir, f"{label_pascal}.{self.extension}"
)
# write data in graph
graph = Graph()
self._init_namespaces(graph)
for edge in edge_list:
rdf_subject = edge.get_source_id()
rdf_object = edge.get_target_id()
rdf_predicate = edge.get_id()
rdf_properties = edge.get_properties()
if rdf_predicate is None:
rdf_predicate = rdf_subject + rdf_object
edge_label = self.translator.name_sentence_to_pascal(
edge.get_label()
)
edge_uri = self.rdf_namespaces["biocypher"][edge_label]
graph.add((edge_uri, RDF.type, RDFS.Class))
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
RDF.type,
edge_uri,
)
)
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
self.rdf_namespaces["biocypher"]["subject"],
self.subject_to_uri(rdf_subject),
)
)
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
self.rdf_namespaces["biocypher"]["object"],
self.subject_to_uri(rdf_object),
)
)
# add properties to the transformed edge --> node
for key, value in rdf_properties.items():
# only write value if it exists.
if value:
self.add_property_to_graph(graph, rdf_predicate, value, key)
graph.serialize(destination=file_name, format=self.rdf_format)
logger.info(
f"Writing {len(edge_list)} entries to {label_pascal}.{self.rdf_format}",
)
return True
def add_property_to_graph(
self,
graph: Graph,
rdf_subject: str,
rdf_object: str,
rdf_predicate: str,
):
"""
Function to add the properties to an RDF node. It takes the graph, the subject, object, and predicate of the RDF triple.
It checks if the property is a list and adds each element to the graph accordingly. Otherwise, it checks whether the string represents a list.
If it does, it transforms it into a list and adds it to the graph. If not, it adds the property to the graph as a literal.
If the property is neither a list nor a string, it is also added as a literal.
Args:
graph (RDFLib.Graph): The RDF graph to add the nodes to.
rdf_subject (str): The subject of the RDF triple.
rdf_object (str): The object of the RDF triple.
rdf_predicate (str): The predicate of the RDF triple.
Returns:
None
"""
if isinstance(rdf_object, list):
for obj in rdf_object:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(obj),
)
)
elif isinstance(rdf_object, str):
if rdf_object.startswith("[") and rdf_object.endswith("]"):
self.add_property_to_graph(
graph,
rdf_subject,
self.transform_string_to_list(rdf_object),
rdf_predicate,
)
else:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(rdf_object),
)
)
else:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(rdf_object),
)
)
def transform_string_to_list(self, string_list: str) -> list:
"""
Function to transform a string representation of a list into a list.
Args:
string_list (str): The string representation of the list.
Returns:
list: The list representation of the input string.
"""
return (
string_list.replace("[", "")
.replace("]", "")
.replace("'", "")
.split(", ")
)
def _write_single_node_list_to_file(
self,
node_list: list,
label: str,
prop_dict: dict,
labels: str,
):
"""
This function takes a list of BioCypherNodes and writes them
to an RDF file in the specified format.
Args:
node_list (list): A list of BioCypherNodes to be written.
label (str): The label (type) of the nodes.
prop_dict (dict): A dictionary of properties and their types for the node class.
Returns:
bool: True if the writing is successful, False otherwise.
"""
if not all(isinstance(n, BioCypherNode) for n in node_list):
logger.error("Nodes must be passed as type BioCypherNode.")
return False
# translate label to PascalCase
label_pascal = self.translator.name_sentence_to_pascal(label)
# create file name
file_name = os.path.join(
self.outdir, f"{label_pascal}.{self.extension}"
)
# write data in graph
graph = Graph()
self._init_namespaces(graph)
for n in node_list:
rdf_subject = n.get_id()
rdf_object = n.get_label()
properties = n.get_properties()
class_name = self.translator.name_sentence_to_pascal(rdf_object)
graph.add(
(
self.rdf_namespaces["biocypher"][class_name],
RDF.type,
RDFS.Class,
)
)
graph.add(
(
self.subject_to_uri(rdf_subject),
RDF.type,
self.rdf_namespaces["biocypher"][class_name],
)
)
for key, value in properties.items():
# only write value if it exists.
if value:
self.add_property_to_graph(graph, rdf_subject, value, key)
graph.serialize(destination=file_name, format=self.rdf_format)
logger.info(
f"Writing {len(node_list)} entries to {label_pascal}.{self.rdf_format}",
)
return True
def write_nodes(
self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
"""
Wrapper for writing nodes in RDF format. It calls the _write_node_data() function, specifying the node data.
Args:
nodes (list or generator): A list or generator of nodes in BioCypherNode format.
batch_size (int): The number of nodes to write in each batch.
force (bool): Flag to force the writing even if the output file already exists.
Returns:
bool: True if the writing is successful, False otherwise.
"""
# check if specified output format is correct
passed = self._is_rdf_format_supported(self.rdf_format)
if not passed:
logger.error("Error while writing node data, wrong RDF format")
return False
# write node data using _write_node_data method
passed = self._write_node_data(nodes, batch_size, force)
if not passed:
logger.error("Error while writing node data.")
return False
return True
def write_edges(
self,
edges: Union[list, GeneratorType],
batch_size: int = int(1e6),
) -> bool:
"""
Wrapper for writing edges in RDF format. It calls _write_edge_data()
function, specifying its edge data.
Args:
edges (BioCypherEdge): a list or generator of edges in
:py:class:`BioCypherEdge` format
batch_size (int): The number of edges to write in each batch.
Returns:
bool: The return value. True for success, False otherwise.
"""
# check if specified output format is correct
passed = self._is_rdf_format_supported(self.rdf_format)
if not passed:
logger.error("Error while writing edge data, wrong RDF format")
return False
# write edge data using _write_edge_data method
passed = self._write_edge_data(edges, batch_size=batch_size)
if not passed:
logger.error("Error while writing edge data.")
return False
return True
def _construct_import_call(self) -> str:
"""
Function to write the import call.
This function is not applicable for RDF.
Returns:
str: An empty string, as no import call is needed for RDF.
"""
return ""
def _write_array_string(self, string_list):
"""
Abstract method to write the string representation of an array into a .csv file
as required by the RDF admin-import.
This function is not applicable for RDF.
Args:
string_list (list): list of ontology strings
Returns:
bool: True, as this method is not applicable for RDF.
"""
return True
def _write_node_headers(self):
"""
Abstract method that takes care of importing properties of a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`
This function is not applicable for RDF.
Returns:
bool: The return value. True for success, False otherwise.
"""
return True
def _write_edge_headers(self):
"""
Abstract method to write a database import-file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
This function is not applicable for RDF.
Returns:
bool: The return value. True for success, False otherwise.
"""
return True
def subject_to_uri(self, subject: str) -> str:
"""
Converts the subject to a proper URI using the available namespaces.
If the conversion fails, it defaults to the biocypher prefix.
Args:
subject (str): The subject to be converted to a URI.
Returns:
str: The corresponding URI for the subject.
"""
try:
_pref, _id = subject.split(":")
if _pref in self.rdf_namespaces.keys():
return self.rdf_namespaces[_pref][_id]
else:
return self.rdf_namespaces["biocypher"][subject]
except ValueError:
return self.rdf_namespaces["biocypher"][subject]
def property_to_uri(self, property_name: str) -> str:
"""
Converts a property name to its corresponding URI.
This function takes a property name and searches for its corresponding URI in various namespaces.
It first checks the core namespaces for rdflib, including owl, rdf, rdfs, xsd, and xml.
Args:
property_name (str): The property name to be converted to a URI.
Returns:
str: The corresponding URI for the input property name.
"""
# These namespaces are core for rdflib; owl, rdf, rdfs, xsd and xml
for namespace in _NAMESPACE_PREFIXES_CORE.values():
if property_name in namespace:
return namespace[property_name]
# If the property name is not found in the core namespaces, search in the SKOS, DC, and DCTERMS namespaces
for namespace in [SKOS, DC, DCTERMS]:
if property_name in namespace:
return namespace[property_name]
# If the property name is still not found, try other namespaces from rdflib.
for namespace in _NAMESPACE_PREFIXES_RDFLIB.values():
if property_name in namespace:
return namespace[property_name]
# If the property name is "licence", it recursively calls the function with "license" as the input.
if property_name == "licence":
return self.property_to_uri("license")
# TODO: add an option to search trough manually implemented namespaces
# If the input is not found in any of the namespaces, it returns the corresponding URI from the biocypher namespace.
# TODO: give a warning and try to prevent this option altogether
return self.rdf_namespaces["biocypher"][property_name]
def _init_namespaces(self, graph: Graph):
"""
Initializes the namespaces for the RDF graph. These namespaces are used to convert nodes to URIs.
This function adds the biocypher standard namespace to the `rdf_namespaces` attribute of the class.
If `rdf_namespaces` is empty, it sets it to the biocypher standard namespace. Otherwise, it merges
the biocypher standard namespace with the namespaces defined in the biocypher_config.yaml.
Args:
graph (RDFLib.Graph): The RDF graph to bind the namespaces to.
Returns:
None
"""
# add biocypher standard to self.rdf_namespaces
biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}
if not self.rdf_namespaces:
self.rdf_namespaces = biocypher_standard
else:
self.rdf_namespaces = self.rdf_namespaces | biocypher_standard
for key, value in self.rdf_namespaces.items():
namespace = Namespace(value)
self.rdf_namespaces[key] = namespace
graph.bind(key, namespace)
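# Hedged usage sketch, not part of the original module: illustrates the
# CURIE-to-URI fallback implemented by subject_to_uri above. The "uniprot"
# prefix and the identifiers are assumptions for illustration; in practice the
# namespaces come from biocypher_config.yaml via _init_namespaces.
if __name__ == "__main__":
    from rdflib import Namespace

    _example_namespaces = {
        "biocypher": Namespace("https://biocypher.org/biocypher#"),
        "uniprot": Namespace("http://purl.uniprot.org/uniprot/"),
    }

    def _to_uri(subject: str) -> str:
        # mirror subject_to_uri: prefer a known prefix, otherwise biocypher
        try:
            prefix, local_id = subject.split(":")
            if prefix in _example_namespaces:
                return _example_namespaces[prefix][local_id]
            return _example_namespaces["biocypher"][subject]
        except ValueError:
            return _example_namespaces["biocypher"][subject]

    print(_to_uri("uniprot:P12345"))  # http://purl.uniprot.org/uniprot/P12345
    print(_to_uri("some_label"))      # https://biocypher.org/biocypher#some_label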

View File

@ -0,0 +1,76 @@
from collections.abc import Iterable
from more_itertools import peekable
from biocypher._logger import logger
from biocypher.output.write._writer import _Writer
from biocypher.output.in_memory._pandas import Pandas
class _PandasCSVWriter(_Writer):
"""
Class for writing node and edge representations to CSV files, one file per entity type.
"""
def __init__(self, *args, write_to_file: bool = True, **kwargs):
kwargs["write_to_file"] = write_to_file
super().__init__(*args, **kwargs)
self.in_memory_dfs = {}
self.stored_dfs = {}
self.pandas_in_memory = Pandas(
translator=self.translator,
deduplicator=self.deduplicator,
)
self.delimiter = kwargs.get("delimiter")
if not self.delimiter:
self.delimiter = ","
self.write_to_file = write_to_file
def _construct_import_call(self) -> str:
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
Returns:
str: Python code to load the csv files into Pandas dfs.
"""
import_call = "import pandas as pd\n\n"
for df_name in self.stored_dfs.keys():
import_call += f"{df_name} = pd.read_csv('./{df_name}.csv', header=0, index_col=0)\n"
return import_call
def _get_import_script_name(self) -> str:
"""Function to return the name of the import script."""
return "import_pandas_csv.py"
def _write_node_data(self, nodes) -> bool:
passed = self._write_entities_to_file(nodes)
return passed
def _write_edge_data(self, edges) -> bool:
passed = self._write_entities_to_file(edges)
return passed
def _write_entities_to_file(self, entities: Iterable) -> bool:
"""Function to write the entities to a CSV file.
Args:
entities (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
Returns:
bool: True for success, False otherwise.
"""
entities = peekable(entities)
entity_list = self.pandas_in_memory._separate_entity_types(entities)
for entity_type, entities in entity_list.items():
self.in_memory_dfs[
entity_type
] = self.pandas_in_memory._add_entity_df(entity_type, entities)
for entity_type in self.in_memory_dfs.keys():
entity_df = self.in_memory_dfs[entity_type]
if " " in entity_type or "." in entity_type:
entity_type = entity_type.replace(" ", "_").replace(".", "_")
if self.write_to_file:
logger.info(
f"Writing {entity_df.shape[0]} entries to {entity_type}.csv."
)
entity_df.to_csv(
f"{self.output_directory}/{entity_type}.csv",
sep=self.delimiter,
)
self.stored_dfs[entity_type] = entity_df
self.in_memory_dfs = {}
return True
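# Hedged sketch of the generated import script (named as returned by
# _get_import_script_name above). The entity types "Protein" and "Interaction"
# are hypothetical; the generated code simply reloads each stored CSV into a
# DataFrame, mirroring _construct_import_call:
#
#     import pandas as pd
#
#     Protein = pd.read_csv('./Protein.csv', header=0, index_col=0)
#     Interaction = pd.read_csv('./Interaction.csv', header=0, index_col=0)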

View File

@ -0,0 +1,320 @@
import os
import glob
from biocypher._logger import logger
from biocypher.output.write._batch_writer import _BatchWriter
class _PostgreSQLBatchWriter(_BatchWriter):
"""
Class for writing node and edge representations to disk using the
format specified by PostgreSQL for the use of "COPY FROM...". Each batch
writer instance has a fixed representation that needs to be passed
at instantiation via the :py:attr:`schema` argument. The instance
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
to convert and extend the hierarchy.
This class inherits from the abstract class "_BatchWriter" and implements the
PostgreSQL-specific methods:
- _write_node_headers
- _write_edge_headers
- _construct_import_call
- _write_array_string
"""
DATA_TYPE_LOOKUP = {
"str": "VARCHAR", # VARCHAR needs limit
"int": "INTEGER",
"long": "BIGINT",
"float": "NUMERIC",
"double": "NUMERIC",
"dbl": "NUMERIC",
"boolean": "BOOLEAN",
"str[]": "VARCHAR[]",
"string[]": "VARCHAR[]",
}
def __init__(self, *args, **kwargs):
self._copy_from_csv_commands = set()
super().__init__(*args, **kwargs)
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location for the psql command
"""
return ""
def _get_data_type(self, string) -> str:
try:
return self.DATA_TYPE_LOOKUP[string]
except KeyError:
logger.info(
f'Could not determine data type {string}. Using default "VARCHAR".'
)
return "VARCHAR"
def _write_array_string(self, string_list) -> str:
"""
Abstract method to write the string representation of an array into a .csv file
as required by the PostgreSQL COPY command, with '{', '}' brackets and ',' separation.
Args:
string_list (list): list of ontology strings
Returns:
str: The string representation of an array for postgres COPY
"""
string = ",".join(string_list)
string = f'"{{{string}}}"'
return string
def _get_import_script_name(self) -> str:
"""
Returns the name of the psql import script
Returns:
str: The name of the import script (ending in .sh)
"""
return f"{self.db_name}-import-call.sh"
def _adjust_pascal_to_psql(self, string):
string = string.replace(".", "_")
string = string.lower()
return string
def _write_node_headers(self):
"""
Writes a SQL table-creation script for each graph entity that is represented
as a node, as per the definition in the `schema_config.yaml`, and registers
the corresponding COPY commands for its CSV part files.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
# create header CSV with ID, properties, labels
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
parts = f"{pascal_label}-part*.csv"
parts_paths = os.path.join(self.outdir, parts)
parts_paths = glob.glob(parts_paths)
parts_paths.sort()
# adjust label for import to psql
pascal_label = self._adjust_pascal_to_psql(pascal_label)
table_create_command_path = os.path.join(
self.outdir,
f"{pascal_label}-create_table.sql",
)
# check if file already exists
if os.path.exists(table_create_command_path):
logger.warning(
f"File {table_create_command_path} already exists. Overwriting.",
)
# concatenate key:value in props
columns = ["_ID VARCHAR"]
for col_name, col_type in props.items():
col_type = self._get_data_type(col_type)
col_name = self._adjust_pascal_to_psql(col_name)
columns.append(f"{col_name} {col_type}")
columns.append("_LABEL VARCHAR[]")
with open(table_create_command_path, "w", encoding="utf-8") as f:
command = ""
if self.wipe:
command += f"DROP TABLE IF EXISTS {pascal_label};\n"
# table creation requires comma separation
command += (
f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
)
f.write(command)
for parts_path in parts_paths:
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
parts_path = parts_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self._copy_from_csv_commands.add(
f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
)
# add file path to import statement
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
table_create_command_path = table_create_command_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self.import_call_nodes.add(table_create_command_path)
return True
def _write_edge_headers(self):
"""
Writes a SQL table-creation script for each graph entity that is represented
as an edge, as per the definition in the `schema_config.yaml`, and registers
the corresponding COPY commands for its CSV part files.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv")
parts_paths = glob.glob(parts_paths)
parts_paths.sort()
# adjust label for import to psql
pascal_label = self._adjust_pascal_to_psql(pascal_label)
table_create_command_path = os.path.join(
self.outdir,
f"{pascal_label}-create_table.sql",
)
# check if file already exists
if os.path.exists(table_create_command_path):
logger.warning(
f"File {table_create_command_path} already exists. Overwriting.",
)
# concatenate key:value in props
columns = []
for col_name, col_type in props.items():
col_type = self._get_data_type(col_type)
col_name = self._adjust_pascal_to_psql(col_name)
if col_name == "_ID":
# should ideally never happen
raise ValueError(
"Column name '_ID' is reserved for internal use, "
"denoting the relationship ID. Please choose a "
"different name for your column."
)
columns.append(f"{col_name} {col_type}")
# assemble the full column list around the property columns;
# the unpacking also avoids an empty check of the property list
out_list = [
"_START_ID VARCHAR",
"_ID VARCHAR",
*columns,
"_END_ID VARCHAR",
"_TYPE VARCHAR",
]
with open(table_create_command_path, "w", encoding="utf-8") as f:
command = ""
if self.wipe:
command += f"DROP TABLE IF EXISTS {pascal_label};\n"
# table creation requires comma separation
command += (
f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
)
f.write(command)
for parts_path in parts_paths:
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
parts_path = parts_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self._copy_from_csv_commands.add(
f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
)
# add file path to import statement
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
table_create_command_path = table_create_command_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self.import_call_edges.add(table_create_command_path)
return True
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for postgresql import
"""
import_call = ""
# create tables
# At this point, csv files of nodes and edges do not require differentiation
for import_file_path in [
*self.import_call_nodes,
*self.import_call_edges,
]:
import_call += f'echo "Setup {import_file_path}..."\n'
if self.db_password:
# set password variable inline
import_call += f"PGPASSWORD={self.db_password} "
import_call += (
f"{self.import_call_bin_prefix}psql -f {import_file_path}"
)
import_call += f" --dbname {self.db_name}"
import_call += f" --host {self.db_host}"
import_call += f" --port {self.db_port}"
import_call += f" --user {self.db_user}"
import_call += '\necho "Done!"\n'
import_call += "\n"
# copy data to tables
for command in self._copy_from_csv_commands:
table_part = command.split(" ")[3]
import_call += f'echo "Importing {table_part}..."\n'
if self.db_password:
# set password variable inline
import_call += f"PGPASSWORD={self.db_password} "
import_call += f'{self.import_call_bin_prefix}psql -c "{command}"'
import_call += f" --dbname {self.db_name}"
import_call += f" --host {self.db_host}"
import_call += f" --port {self.db_port}"
import_call += f" --user {self.db_user}"
import_call += '\necho "Done!"\n'
import_call += "\n"
return import_call
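# Hedged standalone sketch (illustrative, not part of the original module):
# reproduces how _write_node_headers above maps a node label and property dict
# to a CREATE TABLE statement, including the VARCHAR fallback of _get_data_type
# and the label lowering of _adjust_pascal_to_psql. The "Protein" schema used
# here is an assumption.
if __name__ == "__main__":

    def _example_create_table(pascal_label: str, props: dict) -> str:
        lookup = _PostgreSQLBatchWriter.DATA_TYPE_LOOKUP
        label = pascal_label.replace(".", "_").lower()
        columns = ["_ID VARCHAR"]
        for col_name, col_type in props.items():
            sql_type = lookup.get(col_type, "VARCHAR")  # default for unknown types
            columns.append(f"{col_name.replace('.', '_').lower()} {sql_type}")
        columns.append("_LABEL VARCHAR[]")
        return f"CREATE TABLE {label}({','.join(columns)});"

    print(_example_create_table("Protein", {"name": "str", "synonyms": "str[]"}))
    # CREATE TABLE protein(_ID VARCHAR,name VARCHAR,synonyms VARCHAR[],_LABEL VARCHAR[]);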

View File

@ -0,0 +1,51 @@
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
class _SQLiteBatchWriter(_PostgreSQLBatchWriter):
"""
Class for writing node and edge representations to a SQLite database.
It uses the _PostgreSQLBatchWriter class under the hood, which already
implements the logic to write the nodes/edges to a relational DBMS.
Only the import bash script differs between PostgreSQL and SQLite,
and is therefore implemented in this class:
- _construct_import_call
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for sqlite import
"""
import_call = ""
# create tables
# At this point, csv files of nodes and edges do not require differentiation
for import_file_path in [
*self.import_call_nodes,
*self.import_call_edges,
]:
import_call += f'echo "Setup {import_file_path}..."\n'
import_call += f"{self.import_call_bin_prefix}sqlite3 {self.db_name} < {import_file_path}"
import_call += '\necho "Done!"\n'
import_call += "\n"
for command in self._copy_from_csv_commands:
table_name = command.split(" ")[1]
table_part = command.split(" ")[3].replace("'", "")
import_call += f'echo "Importing {table_part}..."\n'
separator = self.delim
import_part = f".import {table_part} {table_name}"
import_call += f"{self.import_call_bin_prefix}sqlite3 -separator $'{separator}' {self.db_name} \"{import_part}\""
import_call += '\necho "Done!"\n'
import_call += "\n"
return import_call
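# Hedged standalone sketch (illustrative): shows how _construct_import_call
# above reuses an inherited PostgreSQL "\copy" command to derive the sqlite3
# ".import" call. The command string, paths and database name are assumptions.
if __name__ == "__main__":
    copy_command = (
        "\\copy protein FROM '/tmp/biocypher-out/Protein-part000.csv' "
        "DELIMITER E';' CSV;"
    )
    table_name = copy_command.split(" ")[1]                 # "protein"
    csv_path = copy_command.split(" ")[3].replace("'", "")  # the part file path
    import_part = f".import {csv_path} {table_name}"
    print(f"sqlite3 -separator $';' biocypher \"{import_part}\"")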