release commit
This commit is contained in:
0  biocypher/output/__init__.py  Normal file
0  biocypher/output/connect/__init__.py  Normal file
422  biocypher/output/connect/_neo4j_driver.py  Normal file
@@ -0,0 +1,422 @@
#!/usr/bin/env python

#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
#                  ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'online' mode. Handles connection and manipulation of a running DBMS.
"""
import subprocess

from biocypher._logger import logger

logger.debug(f"Loading module {__name__}.")

from collections.abc import Iterable
import itertools

import neo4j_utils

from biocypher import _misc
from biocypher._config import config as _config
from biocypher._create import BioCypherEdge, BioCypherNode
from biocypher._translate import Translator

__all__ = ["_Neo4jDriver"]


class _Neo4jDriver:
    """
    Manages a BioCypher connection to a Neo4j database using the
    ``neo4j_utils.Driver`` class.

    Args:
        database_name (str): The name of the database to connect to.
        wipe (bool): Whether to wipe the database before importing.
        uri (str): The URI of the database.
        user (str): The username to use for authentication.
        password (str): The password to use for authentication.
        multi_db (bool): Whether to use multi-database mode.
        fetch_size (int): The number of records to fetch at a time.
        increment_version (bool): Whether to increment the version number.
        translator (Translator): The translator to use for mapping.
    """

    def __init__(
        self,
        database_name: str,
        uri: str,
        user: str,
        password: str,
        multi_db: bool,
        translator: Translator,
        wipe: bool = False,
        fetch_size: int = 1000,
        increment_version: bool = True,
    ):
        self.translator = translator

        self._driver = neo4j_utils.Driver(
            db_name=database_name,
            db_uri=uri,
            db_user=user,
            db_passwd=password,
            fetch_size=fetch_size,
            wipe=wipe,
            multi_db=multi_db,
            raise_errors=True,
        )

        # check for biocypher config in connected graph

        if wipe:
            self.init_db()

        if increment_version:
            # set new current version node
            self._update_meta_graph()

    def _update_meta_graph(self):
        logger.info("Updating Neo4j meta graph.")

        # find current version node
        db_version = self._driver.query(
            "MATCH (v:BioCypher) WHERE NOT (v)-[:PRECEDES]->() RETURN v",
        )
        # add version node
        self.add_biocypher_nodes(self.translator.ontology)

        # connect version node to previous
        if db_version[0]:
            previous = db_version[0][0]
            previous_id = previous["v"]["id"]
            e_meta = BioCypherEdge(
                previous_id,
                self.translator.ontology.get_dict().get("node_id"),
                "PRECEDES",
            )
            self.add_biocypher_edges(e_meta)

    def init_db(self):
        """
        Used to initialise a property graph database by setting up new
        constraints. The wipe has already been performed by the
        ``neo4j_utils.Driver`` class.

        Todo:
            - set up constraint creation interactively depending on the
              need of the database
        """
        logger.info("Initialising database.")
        self._create_constraints()

    def _create_constraints(self):
        """
        Creates constraints on node types in the graph. Used for
        initial setup.

        Grabs leaves of the ``schema_config.yaml`` file and creates
        constraints on the id of all entities represented as nodes.
        """
        logger.info("Creating constraints for node types in config.")

        major_neo4j_version = int(self._get_neo4j_version().split(".")[0])
        # get structure
        for leaf in self.translator.ontology.mapping.extended_schema.items():
            label = _misc.sentencecase_to_pascalcase(leaf[0], sep=r"\s\.")
            if leaf[1]["represented_as"] == "node":
                if major_neo4j_version >= 5:
                    s = (
                        f"CREATE CONSTRAINT `{label}_id` "
                        f"IF NOT EXISTS FOR (n:`{label}`) "
                        "REQUIRE n.id IS UNIQUE"
                    )
                    self._driver.query(s)
                else:
                    s = (
                        f"CREATE CONSTRAINT `{label}_id` "
                        f"IF NOT EXISTS ON (n:`{label}`) "
                        "ASSERT n.id IS UNIQUE"
                    )
                    self._driver.query(s)

    def _get_neo4j_version(self):
        """Get the Neo4j version."""
        try:
            neo4j_version = self._driver.query(
                """
                    CALL dbms.components()
                    YIELD name, versions, edition
                    UNWIND versions AS version
                    RETURN version AS version
                """,
            )[0][0]["version"]
            return neo4j_version
        except Exception as e:
            logger.warning(
                f"Error detecting Neo4j version: {e}; using default version 4.0.0."
            )
            return "4.0.0"

    def add_nodes(self, id_type_tuples: Iterable[tuple]) -> tuple:
        """
        Generic node adder method to add any kind of input to the graph via the
        :class:`biocypher.create.BioCypherNode` class. Employs translation
        functionality and calls the :meth:`add_biocypher_nodes()` method.

        Args:
            id_type_tuples (iterable of 3-tuple): for each node to add to
                the biocypher graph, a 3-tuple with the following layout:
                first, the (unique if constrained) ID of the node; second, the
                type of the node, capitalised or PascalCase and in noun form
                (Neo4j primary label, eg `:Protein`); and third, a dictionary
                of arbitrary properties the node should possess (can be empty).

        Returns:
            2-tuple: the query result of :meth:`add_biocypher_nodes()`
                - first entry: data
                - second entry: Neo4j summary.
        """
        bn = self.translator.translate_nodes(id_type_tuples)
        return self.add_biocypher_nodes(bn)

    def add_edges(self, id_src_tar_type_tuples: Iterable[tuple]) -> tuple:
        """
        Generic edge adder method to add any kind of input to the graph
        via the :class:`biocypher.create.BioCypherEdge` class. Employs
        translation functionality and calls the
        :meth:`add_biocypher_edges()` method.

        Args:
            id_src_tar_type_tuples (iterable of 5-tuple):
                for each edge to add to the biocypher graph, a 5-tuple
                with the following layout: first, the optional unique ID
                of the interaction. This can be `None` if there is no
                systematic identifier (which for many interactions is
                the case). Second and third, the (unique if constrained)
                IDs of the source and target nodes of the relationship;
                fourth, the type of the relationship; and fifth, a
                dictionary of arbitrary properties the edge should
                possess (can be empty).

        Returns:
            2-tuple: the query result of :meth:`add_biocypher_edges()`
                - first entry: data
                - second entry: Neo4j summary.
        """
        bn = self.translator.translate_edges(id_src_tar_type_tuples)
        return self.add_biocypher_edges(bn)

    def add_biocypher_nodes(
        self,
        nodes: Iterable[BioCypherNode],
        explain: bool = False,
        profile: bool = False,
    ) -> bool:
        """
        Accepts a node type handoff class
        (:class:`biocypher.create.BioCypherNode`) with id,
        label, and a dict of properties (passing on the type of
        property, i.e., ``int``, ``str``, ...).

        The dict retrieved by the
        :meth:`biocypher.create.BioCypherNode.get_dict()` method is
        passed into Neo4j as a map of maps, explicitly encoding node id
        and label, and adding all other properties from the 'properties'
        key of the dict. The merge is performed via APOC, matching only
        on node id to prevent duplicates. The same properties are set on
        match and on create, irrespective of the actual event.

        Args:
            nodes:
                An iterable of :class:`biocypher.create.BioCypherNode` objects.
            explain:
                Call ``EXPLAIN`` on the CYPHER query.
            profile:
                Do profiling on the CYPHER query.

        Returns:
            True for success, False otherwise.
        """
        try:
            nodes = _misc.to_list(nodes)
            entities = [node.get_dict() for node in nodes]
        except AttributeError:
            msg = "Nodes must have a `get_dict` method."
            logger.error(msg)
            raise ValueError(msg)

        logger.info(f"Merging {len(entities)} nodes.")

        entity_query = (
            "UNWIND $entities AS ent "
            "CALL apoc.merge.node([ent.node_label], "
            "{id: ent.node_id}, ent.properties, ent.properties) "
            "YIELD node "
            "RETURN node"
        )

        method = "explain" if explain else "profile" if profile else "query"

        result = getattr(self._driver, method)(
            entity_query,
            parameters={
                "entities": entities,
            },
        )

        logger.info("Finished merging nodes.")

        return result

    def add_biocypher_edges(
        self,
        edges: Iterable[BioCypherEdge],
        explain: bool = False,
        profile: bool = False,
    ) -> bool:
        """
        Accepts an edge type handoff class
        (:class:`biocypher.create.BioCypherEdge`) with source
        and target ids, label, and a dict of properties (passing on the
        type of property, i.e., int, string, ...).

        The individual edge is either passed as a singleton, in the case
        of representation as an edge in the graph, or as a 4-tuple, in
        the case of representation as a node (with two edges connecting
        to interaction partners).

        The dict retrieved by the
        :meth:`biocypher.create.BioCypherEdge.get_dict()` method is
        passed into Neo4j as a map of maps, explicitly encoding source
        and target ids and the relationship label, and adding all edge
        properties from the 'properties' key of the dict. The merge is
        performed via APOC, matching only on source and target id to
        prevent duplicates. The same properties are set on match and on
        create, irrespective of the actual event.

        Args:
            edges:
                An iterable of :class:`biocypher.create.BioCypherEdge` objects.
            explain:
                Call ``EXPLAIN`` on the CYPHER query.
            profile:
                Do profiling on the CYPHER query.

        Returns:
            `True` for success, `False` otherwise.
        """
        edges = _misc.ensure_iterable(edges)
        edges = itertools.chain(*(_misc.ensure_iterable(i) for i in edges))

        nodes = []
        rels = []

        try:
            for e in edges:
                if hasattr(e, "get_node"):
                    nodes.append(e.get_node())
                    rels.append(e.get_source_edge().get_dict())
                    rels.append(e.get_target_edge().get_dict())
                else:
                    rels.append(e.get_dict())
        except AttributeError:
            msg = "Edges and nodes must have a `get_dict` method."
            logger.error(msg)
            raise ValueError(msg)

        self.add_biocypher_nodes(nodes)
        logger.info(f"Merging {len(rels)} edges.")

        # cypher query

        # merging only on the ids of the entities, passing the
        # properties on match and on create;
        # TODO add node labels?
        node_query = (
            "UNWIND $rels AS r "
            "MERGE (src {id: r.source_id}) "
            "MERGE (tar {id: r.target_id}) "
        )

        self._driver.query(node_query, parameters={"rels": rels})

        edge_query = (
            "UNWIND $rels AS r "
            "MATCH (src {id: r.source_id}) "
            "MATCH (tar {id: r.target_id}) "
            "WITH src, tar, r "
            "CALL apoc.merge.relationship"
            "(src, r.relationship_label, NULL, "
            "r.properties, tar, r.properties) "
            "YIELD rel "
            "RETURN rel"
        )

        method = "explain" if explain else "profile" if profile else "query"

        result = getattr(self._driver, method)(
            edge_query, parameters={"rels": rels}
        )

        logger.info("Finished merging edges.")

        return result


def get_driver(
    dbms: str,
    translator: "Translator",
):
    """
    Function to return the online driver instance for the selected DBMS.

    Returns:
        instance: an instance of the selected driver class, or None if the
        DBMS is not supported in online mode.
    """
    dbms_config = _config(dbms)

    if dbms == "neo4j":
        return _Neo4jDriver(
            database_name=dbms_config["database_name"],
            wipe=dbms_config["wipe"],
            uri=dbms_config["uri"],
            user=dbms_config["user"],
            password=dbms_config["password"],
            multi_db=dbms_config["multi_db"],
            translator=translator,
        )

    return None
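To illustrate how the online mode above is meant to be used, here is a minimal, hypothetical sketch (not part of the commit): it assumes a running Neo4j instance configured under the `neo4j` section of the BioCypher config, and a `Translator` obtained from a configured BioCypher core instance (shown as a placeholder). Node and edge inputs follow the 3-tuple and 5-tuple layouts documented in `add_nodes` and `add_edges`.

from biocypher.output.connect._neo4j_driver import get_driver

translator = ...  # placeholder: a configured biocypher._translate.Translator

driver = get_driver(dbms="neo4j", translator=translator)

# nodes: (id, type, properties)
driver.add_nodes(
    [
        ("uniprot:P12345", "protein", {"name": "example protein"}),
    ]
)

# edges: (edge id or None, source id, target id, type, properties)
driver.add_edges(
    [
        (None, "uniprot:P12345", "uniprot:Q67890", "interacts_with", {"score": 0.9}),
    ]
)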
0  biocypher/output/in_memory/__init__.py  Normal file
90  biocypher/output/in_memory/_pandas.py  Normal file
@@ -0,0 +1,90 @@
import pandas as pd

from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode


class Pandas:
    def __init__(self, translator, deduplicator):
        self.translator = translator
        self.deduplicator = deduplicator

        self.dfs = {}

    def _separate_entity_types(self, entities):
        """
        Given a mixed iterable of BioCypher objects, separate them into lists
        by type. Also deduplicates using the `Deduplicator` instance.
        """
        lists = {}
        for entity in entities:
            if (
                not isinstance(entity, BioCypherNode)
                and not isinstance(entity, BioCypherEdge)
                and not isinstance(entity, BioCypherRelAsNode)
            ):
                raise TypeError(
                    "Expected a BioCypherNode / BioCypherEdge / "
                    f"BioCypherRelAsNode, got {type(entity)}."
                )

            if isinstance(entity, BioCypherNode):
                seen = self.deduplicator.node_seen(entity)
            elif isinstance(entity, BioCypherEdge):
                seen = self.deduplicator.edge_seen(entity)
            elif isinstance(entity, BioCypherRelAsNode):
                seen = self.deduplicator.rel_as_node_seen(entity)

            if seen:
                continue

            if isinstance(entity, BioCypherRelAsNode):
                node = entity.get_node()
                source_edge = entity.get_source_edge()
                target_edge = entity.get_target_edge()

                _type = node.get_type()
                if _type not in lists:
                    lists[_type] = []
                lists[_type].append(node)

                _source_type = source_edge.get_type()
                if _source_type not in lists:
                    lists[_source_type] = []
                lists[_source_type].append(source_edge)

                _target_type = target_edge.get_type()
                if _target_type not in lists:
                    lists[_target_type] = []
                lists[_target_type].append(target_edge)
                continue

            _type = entity.get_type()
            if _type not in lists:
                lists[_type] = []
            lists[_type].append(entity)

        return lists

    def add_tables(self, entities):
        """
        Add Pandas dataframes for each node and edge type in the input.
        """
        lists = self._separate_entity_types(entities)

        for _type, _entities in lists.items():
            self._add_entity_df(_type, _entities)

    def _add_entity_df(self, _type, _entities):
        df = pd.DataFrame(
            pd.json_normalize([node.get_dict() for node in _entities])
        )
        # replace "properties." with "" in column names
        df.columns = [col.replace("properties.", "") for col in df.columns]
        if _type not in self.dfs:
            self.dfs[_type] = df
        else:
            self.dfs[_type] = pd.concat(
                [self.dfs[_type], df], ignore_index=True
            )
        return self.dfs[_type]
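A short usage sketch of the in-memory output (illustrative only; the `translator` and `deduplicator` placeholders would come from a configured BioCypher instance, and the `BioCypherNode` keyword arguments are an assumption based on the `get_dict()` keys used elsewhere in this commit):

from biocypher._create import BioCypherNode
from biocypher.output.in_memory._pandas import Pandas

in_memory = Pandas(translator=..., deduplicator=...)  # placeholders

nodes = [
    BioCypherNode(node_id="p1", node_label="protein", properties={"score": 1.0}),
    BioCypherNode(node_id="p2", node_label="protein", properties={"score": 0.5}),
]

in_memory.add_tables(nodes)

# one dataframe per entity type, keyed by the type returned by get_type()
print(in_memory.dfs["protein"])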
0  biocypher/output/write/__init__.py  Normal file
1046  biocypher/output/write/_batch_writer.py  Normal file
File diff suppressed because it is too large
113  biocypher/output/write/_get_writer.py  Normal file
@@ -0,0 +1,113 @@
#!/usr/bin/env python

#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
#                  Michael Hartung
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'offline' module. Handles the writing of node and edge representations
suitable for import into a DBMS.
"""

from biocypher._logger import logger
from biocypher.output.write.graph._rdf import _RDFWriter
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
from biocypher.output.write.graph._networkx import _NetworkXWriter
from biocypher.output.write.relational._csv import _PandasCSVWriter
from biocypher.output.write.relational._sqlite import _SQLiteBatchWriter
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter

logger.debug(f"Loading module {__name__}.")

from typing import TYPE_CHECKING

from biocypher._config import config as _config

__all__ = ["get_writer", "DBMS_TO_CLASS"]

if TYPE_CHECKING:
    from biocypher._translate import Translator
    from biocypher._deduplicate import Deduplicator

DBMS_TO_CLASS = {
    "neo": _Neo4jBatchWriter,
    "neo4j": _Neo4jBatchWriter,
    "Neo4j": _Neo4jBatchWriter,
    "postgres": _PostgreSQLBatchWriter,
    "postgresql": _PostgreSQLBatchWriter,
    "PostgreSQL": _PostgreSQLBatchWriter,
    "arango": _ArangoDBBatchWriter,
    "arangodb": _ArangoDBBatchWriter,
    "ArangoDB": _ArangoDBBatchWriter,
    "sqlite": _SQLiteBatchWriter,
    "sqlite3": _SQLiteBatchWriter,
    "rdf": _RDFWriter,
    "RDF": _RDFWriter,
    "csv": _PandasCSVWriter,
    "CSV": _PandasCSVWriter,
    "pandas": _PandasCSVWriter,
    "Pandas": _PandasCSVWriter,
    "networkx": _NetworkXWriter,
    "NetworkX": _NetworkXWriter,
}


def get_writer(
    dbms: str,
    translator: "Translator",
    deduplicator: "Deduplicator",
    output_directory: str,
    strict_mode: bool,
):
    """
    Function to return the writer class based on the selection in the config
    file.

    Args:
        dbms: the database management system; for options, see DBMS_TO_CLASS.
        translator: the Translator object.
        deduplicator: the Deduplicator object.
        output_directory: the directory to write the output files to.
        strict_mode: whether to use strict mode.

    Returns:
        instance: an instance of the selected writer class.
    """
    dbms_config = _config(dbms)

    # use .get() so that an unknown dbms raises a clear error below instead
    # of a KeyError on the dictionary lookup
    writer = DBMS_TO_CLASS.get(dbms)

    if not writer:
        raise ValueError(f"Unknown dbms: {dbms}")

    return writer(
        translator=translator,
        deduplicator=deduplicator,
        delimiter=dbms_config.get("delimiter"),
        array_delimiter=dbms_config.get("array_delimiter"),
        quote=dbms_config.get("quote_character"),
        output_directory=output_directory,
        db_name=dbms_config.get("database_name"),
        import_call_bin_prefix=dbms_config.get("import_call_bin_prefix"),
        import_call_file_prefix=dbms_config.get("import_call_file_prefix"),
        wipe=dbms_config.get("wipe"),
        strict_mode=strict_mode,
        skip_bad_relationships=dbms_config.get("skip_bad_relationships"),  # neo4j
        skip_duplicate_nodes=dbms_config.get("skip_duplicate_nodes"),  # neo4j
        db_user=dbms_config.get("user"),  # psql
        db_password=dbms_config.get("password"),  # psql
        db_port=dbms_config.get("port"),  # psql
        rdf_format=dbms_config.get("rdf_format"),  # rdf
        rdf_namespaces=dbms_config.get("rdf_namespaces"),  # rdf
    )
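A hypothetical call of the factory above (a sketch, not prescribed usage; `translator` and `deduplicator` are placeholders for objects created by the BioCypher core, and `nodes`/`edges` stand for any iterables of BioCypher entities):

from biocypher.output.write._get_writer import get_writer

writer = get_writer(
    dbms="csv",
    translator=...,    # placeholder
    deduplicator=...,  # placeholder
    output_directory="biocypher-out/example",
    strict_mode=False,
)

writer.write_nodes(nodes)          # any iterable of BioCypher node objects
writer.write_edges(edges)          # any iterable of BioCypher edge objects
print(writer.write_import_call())  # path of the generated import script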
200  biocypher/output/write/_writer.py  Normal file
@@ -0,0 +1,200 @@
from abc import ABC, abstractmethod
from typing import Union, Optional
from collections.abc import Iterable
import os

from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
from biocypher._logger import logger
from biocypher._translate import Translator
from biocypher._deduplicate import Deduplicator

__all__ = ["_Writer"]


class _Writer(ABC):
    """Abstract class for writing node and edge representations to disk.

    Specifics of the different writers (e.g. neo4j, postgresql, csv, etc.)
    are implemented in the child classes. Any concrete writer needs to
    implement at least:
    - _write_node_data
    - _write_edge_data
    - _construct_import_call
    - _get_import_script_name

    Args:
        translator (Translator): Instance of :py:class:`Translator` to enable
            translation of nodes and manipulation of properties.
        deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to
            enable deduplication of nodes and edges.
        output_directory (str, optional): Path for exporting CSV files.
            Defaults to None.
        strict_mode (bool, optional): Whether to enforce source, version, and
            license properties. Defaults to False.

    Raises:
        NotImplementedError: Writer implementation must override '_write_node_data'
        NotImplementedError: Writer implementation must override '_write_edge_data'
        NotImplementedError: Writer implementation must override '_construct_import_call'
        NotImplementedError: Writer implementation must override '_get_import_script_name'
    """

    def __init__(
        self,
        translator: Translator,
        deduplicator: Deduplicator,
        output_directory: Optional[str] = None,
        strict_mode: bool = False,
        *args,
        **kwargs,
    ):
        """Abstract class for writing node and edge representations to disk.

        Args:
            translator (Translator): Instance of :py:class:`Translator` to
                enable translation of nodes and manipulation of properties.
            deduplicator (Deduplicator): Instance of :py:class:`Deduplicator`
                to enable deduplication of nodes and edges.
            output_directory (str, optional): Path for exporting CSV files.
                Defaults to None.
            strict_mode (bool, optional): Whether to enforce source, version,
                and license properties. Defaults to False.
        """
        self.translator = translator
        self.deduplicator = deduplicator
        self.strict_mode = strict_mode
        self.output_directory = output_directory

        if os.path.exists(self.output_directory):
            if kwargs.get("write_to_file", True):
                logger.warning(
                    f"Output directory `{self.output_directory}` already exists. "
                    "If this is not planned, file consistency may be compromised."
                )
        else:
            logger.info(f"Creating output directory `{self.output_directory}`.")
            os.makedirs(self.output_directory)

    @abstractmethod
    def _write_node_data(
        self,
        nodes: Iterable[
            Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
        ],
    ) -> bool:
        """Implement how to write nodes to disk.

        Args:
            nodes (Iterable): An iterable of BioCypherNode / BioCypherEdge /
                BioCypherRelAsNode objects.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        raise NotImplementedError(
            "Writer implementation must override '_write_node_data'"
        )

    @abstractmethod
    def _write_edge_data(
        self,
        edges: Iterable[
            Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
        ],
    ) -> bool:
        """Implement how to write edges to disk.

        Args:
            edges (Iterable): An iterable of BioCypherNode / BioCypherEdge /
                BioCypherRelAsNode objects.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        raise NotImplementedError(
            "Writer implementation must override '_write_edge_data'"
        )

    @abstractmethod
    def _construct_import_call(self) -> str:
        """
        Function to construct the import call detailing folder and
        individual node and edge headers and data files, as well as
        delimiters and database name. Built after all data has been
        processed to ensure that nodes are called before any edges.

        Returns:
            str: command for importing the output files into a DBMS.
        """
        raise NotImplementedError(
            "Writer implementation must override '_construct_import_call'"
        )

    @abstractmethod
    def _get_import_script_name(self) -> str:
        """Returns the name of the import script.

        Returns:
            str: The name of the import script (ending in .sh)
        """
        raise NotImplementedError(
            "Writer implementation must override '_get_import_script_name'"
        )

    def write_nodes(
        self, nodes, batch_size: int = int(1e6), force: bool = False
    ):
        """Wrapper for writing nodes.

        Args:
            nodes (BioCypherNode): a list or generator of nodes in
                :py:class:`BioCypherNode` format
            batch_size (int): The batch size for writing nodes.
            force (bool): Whether to force writing nodes even if their type is
                not present in the schema.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        passed = self._write_node_data(nodes)
        if not passed:
            logger.error("Error while writing node data.")
            return False
        return True

    def write_edges(
        self, edges, batch_size: int = int(1e6), force: bool = False
    ):
        """Wrapper for writing edges.

        Args:
            edges (BioCypherEdge): a list or generator of edges in
                :py:class:`BioCypherEdge` format
            batch_size (int): The batch size for writing edges.
            force (bool): Whether to force writing edges even if their type is
                not present in the schema.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        passed = self._write_edge_data(edges)
        if not passed:
            logger.error("Error while writing edge data.")
            return False
        return True

    def write_import_call(self):
        """
        Function to write the import call detailing folder and
        individual node and edge headers and data files, as well as
        delimiters and database name, to the export folder as txt.

        Returns:
            str: The path of the file holding the import call.
        """
        file_path = os.path.join(
            self.output_directory, self._get_import_script_name()
        )
        logger.info(
            f"Writing {self.__class__.__name__} import call to `{file_path}`."
        )

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(self._construct_import_call())

        return file_path
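To make the required interface concrete, the following is a minimal toy subclass (purely illustrative, not part of the commit) that overrides the four abstract methods listed in the class docstring; it only prints entities instead of producing import files:

from biocypher.output.write._writer import _Writer


class _PrintWriter(_Writer):
    """Toy writer that prints entities instead of writing files."""

    def _write_node_data(self, nodes) -> bool:
        for node in nodes:
            print(node.get_dict())
        return True

    def _write_edge_data(self, edges) -> bool:
        for edge in edges:
            print(edge.get_dict())
        return True

    def _construct_import_call(self) -> str:
        return "echo 'nothing to import'"

    def _get_import_script_name(self) -> str:
        return "print-import-call.sh"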
0  biocypher/output/write/graph/__init__.py  Normal file
241  biocypher/output/write/graph/_arangodb.py  Normal file
@@ -0,0 +1,241 @@
import os

from biocypher._logger import logger
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter


class _ArangoDBBatchWriter(_Neo4jBatchWriter):
    """
    Class for writing node and edge representations to disk using the format
    specified by ArangoDB for the use of "arangoimport". Output files are
    similar to Neo4j, but with a different header format.
    """

    def _get_default_import_call_bin_prefix(self):
        """
        Method to provide the default string for the import call bin prefix.

        Returns:
            str: The default location prefix for the arangoimport binary
        """
        return ""

    def _get_import_script_name(self) -> str:
        """
        Returns the name of the ArangoDB import script.

        Returns:
            str: The name of the import script (ending in .sh)
        """
        return "arangodb-import-call.sh"

    def _write_node_headers(self):
        """
        Writes a single CSV file for a graph entity that is represented
        as a node as per the definition in the `schema_config.yaml`,
        containing only the header for this type of node.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        # load headers from data parse
        if not self.node_property_dict:
            logger.error(
                "Header information not found. Was the data parsed first?",
            )
            return False

        for label, props in self.node_property_dict.items():
            # create header CSV with ID, properties, labels

            _id = "_key"

            # translate label to PascalCase
            pascal_label = self.translator.name_sentence_to_pascal(label)

            header = f"{pascal_label}-header.csv"
            header_path = os.path.join(
                self.outdir,
                header,
            )

            # check if file already exists
            if os.path.exists(header_path):
                logger.warning(
                    f"File {header_path} already exists. Overwriting."
                )

            # concatenate key:value in props
            props_list = []
            for k in props.keys():
                props_list.append(f"{k}")

            # create list of lists and flatten
            # removes need for empty check of property list
            out_list = [[_id], props_list]
            out_list = [val for sublist in out_list for val in sublist]

            with open(header_path, "w", encoding="utf-8") as f:
                # concatenate with delimiter
                row = self.delim.join(out_list)
                f.write(row)

            # add collection from schema config
            collection = self.translator.ontology.mapping.extended_schema[
                label
            ].get("db_collection_name", None)

            # add file path to the arangoimport statement
            # do once for each part file
            parts = self.parts.get(label, [])

            if not parts:
                raise ValueError(
                    f"No parts found for node label {label}. "
                    f"Check that the data was parsed first.",
                )

            for part in parts:
                import_call_header_path = os.path.join(
                    self.import_call_file_prefix,
                    header,
                )
                import_call_parts_path = os.path.join(
                    self.import_call_file_prefix,
                    part,
                )

                self.import_call_nodes.add(
                    (
                        import_call_header_path,
                        import_call_parts_path,
                        collection,
                    )
                )

        return True

    def _write_edge_headers(self):
        """
        Writes a single CSV file for a graph entity that is represented
        as an edge as per the definition in the `schema_config.yaml`,
        containing only the header for this type of edge.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        # load headers from data parse
        if not self.edge_property_dict:
            logger.error(
                "Header information not found. Was the data parsed first?",
            )
            return False

        for label, props in self.edge_property_dict.items():
            # translate label to PascalCase
            pascal_label = self.translator.name_sentence_to_pascal(label)

            # paths
            header = f"{pascal_label}-header.csv"
            header_path = os.path.join(
                self.outdir,
                header,
            )
            parts = f"{pascal_label}-part.*"

            # check if file already exists
            if os.path.exists(header_path):
                logger.warning(
                    f"Header file {header_path} already exists. Overwriting."
                )

            # concatenate key:value in props
            props_list = []
            for k in props.keys():
                props_list.append(f"{k}")

            out_list = ["_from", "_key", *props_list, "_to"]

            with open(header_path, "w", encoding="utf-8") as f:
                # concatenate with delimiter
                row = self.delim.join(out_list)
                f.write(row)

            # add collection from schema config
            if not self.translator.ontology.mapping.extended_schema.get(label):
                for (
                    _,
                    v,
                ) in self.translator.ontology.mapping.extended_schema.items():
                    if v.get("label_as_edge") == label:
                        collection = v.get("db_collection_name", None)
                        break

            else:
                collection = self.translator.ontology.mapping.extended_schema[
                    label
                ].get("db_collection_name", None)

            # add file path to the arangoimport statement (import call path
            # may be different from actual output path)
            header_import_call_path = os.path.join(
                self.import_call_file_prefix,
                header,
            )
            parts_import_call_path = os.path.join(
                self.import_call_file_prefix,
                parts,
            )
            self.import_call_edges.add(
                (
                    header_import_call_path,
                    parts_import_call_path,
                    collection,
                )
            )

        return True

    def _construct_import_call(self) -> str:
        """
        Function to construct the import call detailing folder and
        individual node and edge headers and data files, as well as
        delimiters and database name. Built after all data has been
        processed to ensure that nodes are called before any edges.

        Returns:
            str: a bash command for arangoimport
        """
        import_call = (
            f"{self.import_call_bin_prefix}arangoimp "
            f"--type csv "
            f'--separator="{self.escaped_delim}" '
        )

        if self.quote == "'":
            import_call += f'--quote="{self.quote}" '
        else:
            import_call += f"--quote='{self.quote}' "

        node_lines = ""

        # node import calls: one line per node type
        for header_path, parts_path, collection in self.import_call_nodes:
            line = (
                f"{import_call} "
                f"--headers-file {header_path} "
                f"--file={parts_path} "
            )

            if collection:
                line += f"--create-collection --collection {collection} "

            node_lines += f"{line}\n"

        edge_lines = ""

        # edge import calls: one line per edge type, mirroring the node calls
        for header_path, parts_path, collection in self.import_call_edges:
            line = (
                f"{import_call} "
                f"--headers-file {header_path} "
                f"--file={parts_path} "
            )

            if collection:
                line += f"--create-collection --collection {collection} "

            edge_lines += f"{line}\n"

        return node_lines + edge_lines
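For illustration, the header rows produced by the two methods above for a hypothetical node and edge type with properties `name` and `score`, assuming ";" as the delimiter, would look like this (a sketch, not part of the commit):

delim = ";"

node_header = delim.join(["_key", "name", "score"])
edge_header = delim.join(["_from", "_key", "name", "score", "_to"])

print(node_header)  # _key;name;score
print(edge_header)  # _from;_key;name;score;_to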
502  biocypher/output/write/graph/_neo4j.py  Normal file
@@ -0,0 +1,502 @@
import os
import glob
import pandas as pd

from biocypher._logger import logger
from biocypher.output.write._batch_writer import parse_label, _BatchWriter


class _Neo4jBatchWriter(_BatchWriter):
    """
    Class for writing node and edge representations to disk using the
    format specified by Neo4j for the use of admin import. Each batch
    writer instance has a fixed representation that needs to be passed
    at instantiation via the :py:attr:`schema` argument. The instance
    also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
    to convert and extend the hierarchy.

    This class inherits from the abstract class "_BatchWriter" and implements
    the Neo4j-specific methods:

    - _write_node_headers
    - _write_edge_headers
    - _construct_import_call
    - _write_array_string
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor.

        Checks the version of Neo4j and adds a command scope if version >= 5.

        Returns:
            _Neo4jBatchWriter: An instance of the writer.
        """
        # Should read the configuration and set up import_call_bin_prefix.
        super().__init__(*args, **kwargs)

    def _get_default_import_call_bin_prefix(self):
        """
        Method to provide the default string for the import call bin prefix.

        Returns:
            str: The default location for the neo4j admin import location
        """
        return "bin/"

    def _write_array_string(self, string_list):
        """
        Abstract method to write the string representation of an array into a
        .csv file as required by the neo4j admin-import.

        Args:
            string_list (list): list of ontology strings

        Returns:
            str: The string representation of an array for the neo4j admin import
        """
        string = self.adelim.join(string_list)
        return f"{self.quote}{string}{self.quote}"

    def _write_node_headers(self):
        """
        Writes a single CSV file for a graph entity that is represented
        as a node as per the definition in the `schema_config.yaml`,
        containing only the header for this type of node.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        # load headers from data parse
        if not self.node_property_dict:
            logger.error(
                "Header information not found. Was the data parsed first?",
            )
            return False

        for label, props in self.node_property_dict.items():
            _id = ":ID"

            ## MeDaX dev remark:
            ## From FHIR data we get case-sensitive labels, e.g. 'Procedure'
            ## and 'procedure' are two distinct node types, because we convert
            ## Resources to more specific node classes using their
            ## "resourceType" attribute.

            # translate label to PascalCase
            pascal_label = self.translator.name_sentence_to_pascal(
                parse_label(label)
            )

            header = f"{pascal_label}-header.csv"
            header_path = os.path.join(
                self.outdir,
                header,
            )
            parts = f"{pascal_label}-part.*"

            existing_header = False
            # check if file already exists
            if os.path.exists(header_path):
                logger.warning(
                    f"Header file `{header_path}` already exists. Overwriting.",
                )
                with open(header_path, "r", encoding="utf-8") as existing:
                    existing_header = existing.read().strip().split(self.delim)

            # concatenate key:value in props
            props_list = []
            for k, v in props.items():
                if v in ["int", "long", "integer"]:
                    props_list.append(f"{k}:long")
                elif v in ["int[]", "long[]", "integer[]"]:
                    props_list.append(f"{k}:long[]")
                elif v in ["float", "double", "dbl"]:
                    props_list.append(f"{k}:double")
                elif v in ["float[]", "double[]"]:
                    props_list.append(f"{k}:double[]")
                elif v in ["bool", "boolean"]:
                    # TODO Neo4j boolean support / spelling?
                    props_list.append(f"{k}:boolean")
                elif v in ["bool[]", "boolean[]"]:
                    props_list.append(f"{k}:boolean[]")
                elif v in ["str[]", "string[]"]:
                    props_list.append(f"{k}:string[]")
                else:
                    props_list.append(f"{k}")

            # create list of lists and flatten
            out_list = [[_id], props_list, [":LABEL"]]
            out_list = [val for sublist in out_list for val in sublist]

            with open(header_path, "w", encoding="utf-8") as f:
                # If a header file already existed with different columns,
                # bring the existing part files in line with the new header
                if existing_header and set(existing_header) != set(out_list):
                    # Get part files associated with this header
                    base_name = os.path.basename(header_path).replace(
                        "-header.csv", ""
                    )
                    part_files = glob.glob(
                        os.path.join(
                            os.path.dirname(header_path),
                            f"{base_name}-part*.csv",
                        )
                    )

                    # Find the highest numbered part file without full sorting
                    highest_part = None
                    highest_number = -1

                    for part_file in part_files:
                        try:
                            # Extract the number from the filename (assuming a
                            # format like "part123.csv")
                            file_name = os.path.basename(part_file)
                            number_part = file_name.split("part")[1].split(".")[0]
                            number = int(number_part)

                            if number > highest_number:
                                highest_number = number
                                highest_part = part_file
                        except (IndexError, ValueError):
                            # Skip files that don't match the expected pattern
                            continue

                    # Update each part file with the new columns
                    for part_file in part_files:
                        if part_file == highest_part:
                            logger.info(
                                f"Skipping the highest part file: {highest_part}"
                            )
                            continue
                        try:
                            df = self.adapt_csv_to_new_header(
                                existing_header, out_list, part_file
                            )
                            # Write back to file WITHOUT including the header
                            df.to_csv(
                                part_file,
                                sep=self.delim,
                                index=False,
                                header=False,
                            )
                            logger.info(
                                f"Updated {part_file} with new columns in "
                                "correct positions"
                            )
                        except Exception as e:
                            logger.error(f"Error updating {part_file}: {e}")

                # Write the new header
                row = self.delim.join(out_list)
                f.write(row)

            # add file path to neo4j-admin import statement (import call file
            # path may be different from actual file path)
            import_call_header_path = os.path.join(
                self.import_call_file_prefix,
                header,
            )
            import_call_parts_path = os.path.join(
                self.import_call_file_prefix,
                parts,
            )
            self.import_call_nodes.add(
                (import_call_header_path, import_call_parts_path)
            )

        return True

    def _write_edge_headers(self):
        """
        Writes a single CSV file for a graph entity that is represented
        as an edge as per the definition in the `schema_config.yaml`,
        containing only the header for this type of edge.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        # load headers from data parse
        if not self.edge_property_dict:
            logger.error(
                "Header information not found. Was the data parsed first?",
            )
            return False

        for label, props in self.edge_property_dict.items():
            # translate label to PascalCase
            pascal_label = self.translator.name_sentence_to_pascal(
                parse_label(label)
            )

            # paths
            header = f"{pascal_label}-header.csv"
            header_path = os.path.join(
                self.outdir,
                header,
            )
            parts = f"{pascal_label}-part.*"

            # check if file already exists
            if os.path.exists(header_path):
                logger.warning(
                    f"File {header_path} already exists. Overwriting."
                )

            # concatenate key:value in props
            props_list = []
            for k, v in props.items():
                if v in ["int", "long", "integer"]:
                    props_list.append(f"{k}:long")
                elif v in ["int[]", "long[]", "integer[]"]:
                    props_list.append(f"{k}:long[]")
                elif v in ["float", "double"]:
                    props_list.append(f"{k}:double")
                elif v in ["float[]", "double[]"]:
                    props_list.append(f"{k}:double[]")
                elif v in [
                    "bool",
                    "boolean",
                ]:  # TODO does Neo4j support bool?
                    props_list.append(f"{k}:boolean")
                elif v in ["bool[]", "boolean[]"]:
                    props_list.append(f"{k}:boolean[]")
                elif v in ["str[]", "string[]"]:
                    props_list.append(f"{k}:string[]")
                else:
                    props_list.append(f"{k}")

            skip_id = False
            schema_label = None

            if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
                skip_id = True
            elif not self.translator.ontology.mapping.extended_schema.get(
                label
            ):
                # find label in schema by label_as_edge
                for (
                    k,
                    v,
                ) in self.translator.ontology.mapping.extended_schema.items():
                    if v.get("label_as_edge") == label:
                        schema_label = k
                        break
            else:
                schema_label = label

            out_list = [":START_ID"]

            if schema_label:
                if (
                    self.translator.ontology.mapping.extended_schema.get(
                        schema_label
                    ).get("use_id")
                    is False
                ):
                    skip_id = True

            if not skip_id:
                out_list.append("id")

            out_list.extend(props_list)
            out_list.extend([":END_ID", ":TYPE"])

            existing_header = False
            # check if file already exists
            if os.path.exists(header_path):
                logger.warning(
                    f"Header file `{header_path}` already exists. Overwriting.",
                )
                with open(header_path, "r", encoding="utf-8") as existing:
                    existing_header = existing.read().strip().split(self.delim)

            with open(header_path, "w", encoding="utf-8") as f:
                # If a header file already existed with different columns,
                # bring the existing part files in line with the new header
                if existing_header and set(existing_header) != set(out_list):
                    # Get part files associated with this header
                    base_name = os.path.basename(header_path).replace(
                        "-header.csv", ""
                    )
                    part_files = glob.glob(
                        os.path.join(
                            os.path.dirname(header_path),
                            f"{base_name}-part*.csv",
                        )
                    )

                    # Find the highest numbered part file without full sorting
                    highest_part = None
                    highest_number = -1

                    for part_file in part_files:
                        try:
                            # Extract the number from the filename (assuming a
                            # format like "part123.csv")
                            file_name = os.path.basename(part_file)
                            number_part = file_name.split("part")[1].split(".")[0]
                            number = int(number_part)

                            if number > highest_number:
                                highest_number = number
                                highest_part = part_file
                        except (IndexError, ValueError):
                            # Skip files that don't match the expected pattern
                            continue

                    # Update each part file with the new columns
                    for part_file in part_files:
                        if part_file == highest_part:
                            logger.info(
                                f"Skipping the highest part file: {highest_part}"
                            )
                            continue
                        try:
                            df = self.adapt_csv_to_new_header(
                                existing_header, out_list, part_file
                            )
                            # Write back to file WITHOUT including the header
                            df.to_csv(
                                part_file,
                                sep=self.delim,
                                index=False,
                                header=False,
                            )
                            logger.info(
                                f"Updated {part_file} with new columns in "
                                "correct positions"
                            )
                        except Exception as e:
                            logger.error(f"Error updating {part_file}: {e}")

                # Write the new header
                row = self.delim.join(out_list)
                f.write(row)

            # add file path to neo4j-admin import statement (import call file
            # path may be different from actual file path)
            import_call_header_path = os.path.join(
                self.import_call_file_prefix,
                header,
            )
            import_call_parts_path = os.path.join(
                self.import_call_file_prefix,
                parts,
            )
            self.import_call_edges.add(
                (import_call_header_path, import_call_parts_path)
            )

        return True

    def _get_import_script_name(self) -> str:
        """
        Returns the name of the neo4j admin import script.

        Returns:
            str: The name of the import script (ending in .sh)
        """
        return "neo4j-admin-import-call.sh"

    def _construct_import_call(self) -> str:
        """
        Function to construct the import call detailing folder and
        individual node and edge headers and data files, as well as
        delimiters and database name. Built after all data has been
        processed to ensure that nodes are called before any edges.

        Returns:
            str: a bash command for neo4j-admin import
        """
        import_call_neo4j_v4 = self._get_import_call(
            "import", "--database=", "--force="
        )
        import_call_neo4j_v5 = self._get_import_call(
            "database import full", "", "--overwrite-destination="
        )
        neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"

        import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
        return import_script

    def _get_import_call(
        self, import_cmd: str, database_cmd: str, wipe_cmd: str
    ) -> str:
        """Get parametrized import call for Neo4j 4 or 5+.

        Args:
            import_cmd (str): The import command to use.
            database_cmd (str): The database command to use.
            wipe_cmd (str): The wipe command to use.

        Returns:
            str: The import call.
        """
        import_call = f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "

        import_call += f"{database_cmd}{self.db_name} "

        import_call += f'--delimiter="{self.escaped_delim}" '

        import_call += f'--array-delimiter="{self.escaped_adelim}" '

        if self.quote == "'":
            import_call += f'--quote="{self.quote}" '
        else:
            import_call += f"--quote='{self.quote}' "

        if self.wipe:
            import_call += f"{wipe_cmd}true "
        if self.skip_bad_relationships:
            import_call += "--skip-bad-relationships=true "
        if self.skip_duplicate_nodes:
            import_call += "--skip-duplicate-nodes=true "

        # append node import calls
        for header_path, parts_path in self.import_call_nodes:
            import_call += f'--nodes="{header_path},{parts_path}" '

        # append edge import calls
        for header_path, parts_path in self.import_call_edges:
            import_call += f'--relationships="{header_path},{parts_path}" '

        return import_call

    def adapt_csv_to_new_header(self, old_header, new_header, csv_file_path):
        """
        Adapt a CSV table to a new header structure, placing new columns in
        their correct positions.

        Parameters:
            old_header (list): The original header columns
            new_header (list): The new header columns
            csv_file_path (str): Path to the CSV file

        Returns:
            pandas.DataFrame: CSV data with the new header structure
        """
        # Step 1: Read the CSV data without headers
        df = pd.read_csv(csv_file_path, sep=self.delim, header=None)

        # Step 2: If the file is empty, return an empty DataFrame with the
        # new headers
        if df.empty:
            return pd.DataFrame(columns=new_header)

        # Step 3: If the column count doesn't match the old header length,
        # handle the mismatch
        if len(df.columns) != len(old_header):
            logger.warning(
                f"CSV column count ({len(df.columns)}) doesn't match the "
                f"provided old header count ({len(old_header)})"
            )
            # If the file has fewer columns than old_header, pad with NaN
            if len(df.columns) < len(old_header):
                for i in range(len(df.columns), len(old_header)):
                    df[i] = None
            # If the file has more columns than old_header, truncate
            else:
                df = df.iloc[:, : len(old_header)]

        # Step 4: Assign old header names to the dataframe
        df.columns = old_header

        # Step 5: Create a new DataFrame with the correct structure
        new_df = pd.DataFrame(columns=new_header)

        # Step 6: For each column in the new header, find its position in the
        # old header
        for new_col in new_header:
            if new_col in old_header:
                # If the column exists in the old header, copy the data
                new_df[new_col] = df[new_col]
            else:
                # If it is a new column, add an empty column
                new_df[new_col] = None

        # Step 7: Ensure columns are in the exact order of new_header
        new_df = new_df[new_header]

        return new_df
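The property-type to neo4j-admin column mapping used by the two header writers above can be summarised by this standalone sketch (the example properties and the ";" delimiter are assumptions for illustration):

props = {"name": "str", "score": "float", "synonyms": "str[]", "count": "int"}

TYPE_SUFFIX = {
    "int": ":long", "long": ":long", "integer": ":long",
    "int[]": ":long[]", "long[]": ":long[]", "integer[]": ":long[]",
    "float": ":double", "double": ":double", "dbl": ":double",
    "float[]": ":double[]", "double[]": ":double[]",
    "bool": ":boolean", "boolean": ":boolean",
    "bool[]": ":boolean[]", "boolean[]": ":boolean[]",
    "str[]": ":string[]", "string[]": ":string[]",
}

# plain strings get no type suffix, exactly as in _write_node_headers
columns = [":ID"] + [f"{k}{TYPE_SUFFIX.get(v, '')}" for k, v in props.items()] + [":LABEL"]
print(";".join(columns))
# :ID;name;score:double;synonyms:string[];count:long;:LABEL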
76  biocypher/output/write/graph/_networkx.py  Normal file
@@ -0,0 +1,76 @@
import pickle

import networkx as nx

from biocypher._logger import logger
from biocypher.output.write._writer import _Writer
from biocypher.output.write.relational._csv import _PandasCSVWriter


class _NetworkXWriter(_Writer):
    """
    Class for writing nodes and edges to a networkx DiGraph.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.csv_writer = _PandasCSVWriter(*args, write_to_file=False, **kwargs)
        self.G = nx.DiGraph()

    def _construct_import_call(self) -> str:
        """Function to construct the Python code that loads the pickled
        networkx graph again.

        Returns:
            str: Python code to load the pickled networkx graph.
        """
        logger.info(
            f"Writing networkx {self.G} to pickle file networkx_graph.pkl."
        )
        with open(f"{self.output_directory}/networkx_graph.pkl", "wb") as f:
            pickle.dump(self.G, f)

        import_call = "import pickle\n"
        import_call += "with open('./networkx_graph.pkl', 'rb') as f:\n\tG_loaded = pickle.load(f)"
        return import_call

    def _get_import_script_name(self) -> str:
        """Function to return the name of the import script."""
        return "import_networkx.py"

    def _write_node_data(self, nodes) -> bool:
        passed = self.csv_writer._write_entities_to_file(nodes)
        self.add_to_networkx()
        return passed

    def _write_edge_data(self, edges) -> bool:
        passed = self.csv_writer._write_entities_to_file(edges)
        self.add_to_networkx()
        return passed

    def add_to_networkx(self) -> bool:
        all_dfs = self.csv_writer.stored_dfs
        node_dfs = [
            df
            for df in all_dfs.values()
            if df.columns.str.contains("node_id").any()
        ]
        edge_dfs = [
            df
            for df in all_dfs.values()
            if df.columns.str.contains("source_id").any()
            and df.columns.str.contains("target_id").any()
        ]
        for df in node_dfs:
            nodes = df.set_index("node_id").to_dict(orient="index")
            self.G.add_nodes_from(nodes.items())
        for df in edge_dfs:
            edges = df.set_index(["source_id", "target_id"]).to_dict(
                orient="index"
            )
            self.G.add_edges_from(
                (
                    (source, target, attrs)
                    for (source, target), attrs in edges.items()
                )
            )
        return True
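The `import_networkx.py` script generated by `_construct_import_call` boils down to the following, run from the output directory (the print line is added here only to show the loaded graph):

import pickle

with open("./networkx_graph.pkl", "rb") as f:
    G_loaded = pickle.load(f)

print(G_loaded.number_of_nodes(), G_loaded.number_of_edges())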
515  biocypher/output/write/graph/_rdf.py  Normal file
@ -0,0 +1,515 @@
#!/usr/bin/env python

#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Loes van den Biggelaar
#                 Sebastian Lobentanzer
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'offline' module. Handles the writing of node and edge representations
suitable for import into a DBMS.
"""
from types import GeneratorType
from typing import Union
import os

from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
from rdflib.namespace import (
    _NAMESPACE_PREFIXES_CORE,
    _NAMESPACE_PREFIXES_RDFLIB,
)

from biocypher._create import BioCypherEdge, BioCypherNode
from biocypher._logger import logger
from biocypher.output.write._batch_writer import _BatchWriter


class _RDFWriter(_BatchWriter):
    """
    Class to write BioCypher's property graph into an RDF format using
    rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
    N-Quads, Turtle, TriX, TriG and JSON-LD). By default the conversion
    is done keeping only the minimum information about nodes and edges,
    skipping all properties.
    """

    def _get_import_script_name(self) -> str:
        """
        Returns the name of the RDF admin import script.
        This function is applicable to RDF export.

        Returns:
            str: The name of the import script (ending in .sh)
        """
        return "rdf-import-call.sh"

    def _get_default_import_call_bin_prefix(self):
        """
        Method to provide the default string for the import call bin prefix.

        Returns:
            str: The default location for the RDF admin import location
        """
        return "bin/"

    def _is_rdf_format_supported(self, rdf_format: str) -> bool:
        """
        Function to check if the specified RDF format is supported.

        Args:
            rdf_format (str): The RDF format to check.

        Returns:
            bool: True if the RDF format is supported, False otherwise.
        """
        supported_formats = [
            "xml",
            "n3",
            "turtle",
            "nt",
            "pretty-xml",
            "trix",
            "trig",
            "nquads",
            "json-ld",
        ]
        if rdf_format not in supported_formats:
            logger.error(
                f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
                f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
            )
            return False
        else:
            # rdflib does not accept 'ttl' as a format name, only 'turtle'.
            # However, the preferred file extension is always '.ttl'.
            if self.rdf_format == "turtle":
                self.extension = "ttl"
            elif self.rdf_format == "ttl":
                self.rdf_format = "turtle"
                self.extension = "ttl"
            else:
                self.extension = self.rdf_format
            return True
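As a quick illustration of the format/extension distinction handled above (a standalone sketch with an invented node, not part of the diff): rdflib only accepts "turtle" as the format name, while ".ttl" is the customary file extension.

from rdflib import Graph, Literal, Namespace

BC = Namespace("https://biocypher.org/biocypher#")

g = Graph()
g.bind("biocypher", BC)
g.add((BC["example_node"], BC["label"], Literal("example")))

# "turtle" is the rdflib format name; the file is nevertheless named with ".ttl"
g.serialize(destination="Example.ttl", format="turtle")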
    def _write_single_edge_list_to_file(
        self,
        edge_list: list,
        label: str,
        prop_dict: dict,
    ):
        """
        This function takes one list of BioCypher edges and writes them
        to an RDF file with the given format.

        Args:
            edge_list (list): list of BioCypherEdges to be written

            label (str): the label (type) of the edge

            prop_dict (dict): properties of the edge class passed from the
                parsing function and their types

        Returns:
            bool: The return value. True for success, False otherwise.
        """

        if not all(isinstance(n, BioCypherEdge) for n in edge_list):
            logger.error("Edges must be passed as type BioCypherEdge.")
            return False

        # translate label to PascalCase
        label_pascal = self.translator.name_sentence_to_pascal(label)

        # create file name
        file_name = os.path.join(
            self.outdir, f"{label_pascal}.{self.extension}"
        )

        # write data in graph
        graph = Graph()
        self._init_namespaces(graph)

        for edge in edge_list:
            rdf_subject = edge.get_source_id()
            rdf_object = edge.get_target_id()
            rdf_predicate = edge.get_id()
            rdf_properties = edge.get_properties()
            if rdf_predicate is None:
                rdf_predicate = rdf_subject + rdf_object

            edge_label = self.translator.name_sentence_to_pascal(
                edge.get_label()
            )
            edge_uri = self.rdf_namespaces["biocypher"][edge_label]
            graph.add((edge_uri, RDF.type, RDFS.Class))
            graph.add(
                (
                    self.rdf_namespaces["biocypher"][rdf_predicate],
                    RDF.type,
                    edge_uri,
                )
            )
            graph.add(
                (
                    self.rdf_namespaces["biocypher"][rdf_predicate],
                    self.rdf_namespaces["biocypher"]["subject"],
                    self.subject_to_uri(rdf_subject),
                )
            )
            graph.add(
                (
                    self.rdf_namespaces["biocypher"][rdf_predicate],
                    self.rdf_namespaces["biocypher"]["object"],
                    self.subject_to_uri(rdf_object),
                )
            )

            # add properties to the transformed edge --> node
            for key, value in rdf_properties.items():
                # only write value if it exists.
                if value:
                    self.add_property_to_graph(graph, rdf_predicate, value, key)

        graph.serialize(destination=file_name, format=self.rdf_format)

        logger.info(
            f"Writing {len(edge_list)} entries to {label_pascal}.{self.extension}",
        )

        return True
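For readers unfamiliar with the reification pattern used above, a minimal sketch of the triples produced for one hypothetical edge (ids and labels invented): the edge id becomes a resource typed by its edge class and linked to both endpoints via biocypher:subject and biocypher:object.

from rdflib import RDF, RDFS, Graph, Namespace

BC = Namespace("https://biocypher.org/biocypher#")

g = Graph()
g.bind("biocypher", BC)

rdf_subject, rdf_object, rdf_predicate = "p1", "p2", "edge42"
edge_label = "ProteinProteinInteraction"

g.add((BC[edge_label], RDF.type, RDFS.Class))         # the edge class
g.add((BC[rdf_predicate], RDF.type, BC[edge_label]))  # the edge instance
g.add((BC[rdf_predicate], BC["subject"], BC[rdf_subject]))
g.add((BC[rdf_predicate], BC["object"], BC[rdf_object]))

print(g.serialize(format="turtle"))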
    def add_property_to_graph(
        self,
        graph: Graph,
        rdf_subject: str,
        rdf_object: str,
        rdf_predicate: str,
    ):
        """
        Function to add the properties to an RDF node. It takes the graph, the
        subject, object, and predicate of the RDF triple. If the property is a
        list, each element is added to the graph. Otherwise, if the string
        represents a list, it is transformed into a list and added to the
        graph; if not, the property is added to the graph as a literal. If the
        property is neither a list nor a string, it is also added as a literal.

        Args:
            graph (RDFLib.Graph): The RDF graph to add the nodes to.

            rdf_subject (str): The subject of the RDF triple.

            rdf_object (str): The object of the RDF triple.

            rdf_predicate (str): The predicate of the RDF triple.

        Returns:
            None
        """
        if isinstance(rdf_object, list):
            for obj in rdf_object:
                graph.add(
                    (
                        self.subject_to_uri(rdf_subject),
                        self.property_to_uri(rdf_predicate),
                        Literal(obj),
                    )
                )
        elif isinstance(rdf_object, str):
            if rdf_object.startswith("[") and rdf_object.endswith("]"):
                self.add_property_to_graph(
                    graph,
                    rdf_subject,
                    self.transform_string_to_list(rdf_object),
                    rdf_predicate,
                )
            else:
                graph.add(
                    (
                        self.subject_to_uri(rdf_subject),
                        self.property_to_uri(rdf_predicate),
                        Literal(rdf_object),
                    )
                )
        else:
            graph.add(
                (
                    self.subject_to_uri(rdf_subject),
                    self.property_to_uri(rdf_predicate),
                    Literal(rdf_object),
                )
            )

    def transform_string_to_list(self, string_list: str) -> list:
        """
        Function to transform a string representation of a list into a list.

        Args:
            string_list (str): The string representation of the list.

        Returns:
            list: The list representation of the input string.
        """
        return (
            string_list.replace("[", "")
            .replace("]", "")
            .replace("'", "")
            .split(", ")
        )
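A small standalone sketch of how a stringified list property such as "['kinase', 'enzyme']" ends up as separate literals, mirroring transform_string_to_list() and add_property_to_graph() above; the subject and property names are invented for illustration.

from rdflib import Graph, Literal, Namespace

BC = Namespace("https://biocypher.org/biocypher#")

def transform_string_to_list(string_list: str) -> list:
    # same parsing convention as the writer: strip brackets and quotes, split on ", "
    return (
        string_list.replace("[", "").replace("]", "").replace("'", "").split(", ")
    )

g = Graph()
g.bind("biocypher", BC)

subject = BC["uniprot_P12345"]
for value in transform_string_to_list("['kinase', 'enzyme']"):
    g.add((subject, BC["annotation"], Literal(value)))

print(g.serialize(format="turtle"))
# roughly: biocypher:uniprot_P12345 biocypher:annotation "enzyme", "kinase" .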
    def _write_single_node_list_to_file(
        self,
        node_list: list,
        label: str,
        prop_dict: dict,
        labels: str,
    ):
        """
        This function takes a list of BioCypherNodes and writes them
        to an RDF file in the specified format.

        Args:
            node_list (list): A list of BioCypherNodes to be written.

            label (str): The label (type) of the nodes.

            prop_dict (dict): A dictionary of properties and their types for the node class.

            labels (str): The full label string for this node class (unused in the RDF writer).

        Returns:
            bool: True if the writing is successful, False otherwise.
        """
        if not all(isinstance(n, BioCypherNode) for n in node_list):
            logger.error("Nodes must be passed as type BioCypherNode.")
            return False

        # translate label to PascalCase
        label_pascal = self.translator.name_sentence_to_pascal(label)

        # create file name
        file_name = os.path.join(
            self.outdir, f"{label_pascal}.{self.extension}"
        )

        # write data in graph
        graph = Graph()
        self._init_namespaces(graph)

        for n in node_list:
            rdf_subject = n.get_id()
            rdf_object = n.get_label()
            properties = n.get_properties()
            class_name = self.translator.name_sentence_to_pascal(rdf_object)
            graph.add(
                (
                    self.rdf_namespaces["biocypher"][class_name],
                    RDF.type,
                    RDFS.Class,
                )
            )
            graph.add(
                (
                    self.subject_to_uri(rdf_subject),
                    RDF.type,
                    self.rdf_namespaces["biocypher"][class_name],
                )
            )
            for key, value in properties.items():
                # only write value if it exists.
                if value:
                    self.add_property_to_graph(graph, rdf_subject, value, key)

        graph.serialize(destination=file_name, format=self.rdf_format)

        logger.info(
            f"Writing {len(node_list)} entries to {label_pascal}.{self.extension}",
        )

        return True
    def write_nodes(
        self, nodes, batch_size: int = int(1e6), force: bool = False
    ) -> bool:
        """
        Wrapper for writing nodes in RDF format. It calls the
        _write_node_data() function, specifying the node data.

        Args:
            nodes (list or generator): A list or generator of nodes in BioCypherNode format.
            batch_size (int): The number of nodes to write in each batch.
            force (bool): Flag to force the writing even if the output file already exists.

        Returns:
            bool: True if the writing is successful, False otherwise.
        """
        # check if specified output format is correct
        passed = self._is_rdf_format_supported(self.rdf_format)
        if not passed:
            logger.error("Error while writing node data, wrong RDF format")
            return False
        # write node data using _write_node_data method
        passed = self._write_node_data(nodes, batch_size, force)
        if not passed:
            logger.error("Error while writing node data.")
            return False
        return True

    def write_edges(
        self,
        edges: Union[list, GeneratorType],
        batch_size: int = int(1e6),
    ) -> bool:
        """
        Wrapper for writing edges in RDF format. It calls the
        _write_edge_data() function, specifying the edge data.

        Args:
            edges (BioCypherEdge): a list or generator of edges in
                :py:class:`BioCypherEdge` format
            batch_size (int): The number of edges to write in each batch.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        # check if specified output format is correct
        passed = self._is_rdf_format_supported(self.rdf_format)
        if not passed:
            logger.error("Error while writing edge data, wrong RDF format")
            return False
        # write edge data using _write_edge_data method
        passed = self._write_edge_data(edges, batch_size=batch_size)
        if not passed:
            logger.error("Error while writing edge data.")
            return False

        return True
    def _construct_import_call(self) -> str:
        """
        Function to write the import call.
        This function is not applicable for RDF.

        Returns:
            str: An empty string, as no import call is needed for RDF.
        """
        return ""

    def _write_array_string(self, string_list):
        """
        Abstract method to write the string representation of an array into a
        .csv file as required by the neo4j admin-import.
        This function is not applicable for RDF.

        Args:
            string_list (list): list of ontology strings

        Returns:
            bool: Always True; nothing needs to be written for RDF.
        """

        return True

    def _write_node_headers(self):
        """
        Abstract method that takes care of importing properties of a graph entity that is represented
        as a node as per the definition in the `schema_config.yaml`.
        This function is not applicable for RDF.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        return True

    def _write_edge_headers(self):
        """
        Abstract method to write a database import-file for a graph entity that is represented
        as an edge as per the definition in the `schema_config.yaml`,
        containing only the header for this type of edge.
        This function is not applicable for RDF.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        return True
    def subject_to_uri(self, subject: str) -> str:
        """
        Converts the subject to a proper URI using the available namespaces.
        If the conversion fails, it defaults to the biocypher prefix.

        Args:
            subject (str): The subject to be converted to a URI.

        Returns:
            str: The corresponding URI for the subject.
        """
        try:
            _pref, _id = subject.split(":")

            if _pref in self.rdf_namespaces.keys():
                return self.rdf_namespaces[_pref][_id]
            else:
                return self.rdf_namespaces["biocypher"][subject]
        except ValueError:
            return self.rdf_namespaces["biocypher"][subject]

    def property_to_uri(self, property_name: str) -> str:
        """
        Converts a property name to its corresponding URI.

        This function takes a property name and searches for its corresponding URI in various namespaces.
        It first checks the core namespaces for rdflib, including owl, rdf, rdfs, xsd, and xml.

        Args:
            property_name (str): The property name to be converted to a URI.

        Returns:
            str: The corresponding URI for the input property name.
        """
        # These namespaces are core for rdflib; owl, rdf, rdfs, xsd and xml
        for namespace in _NAMESPACE_PREFIXES_CORE.values():
            if property_name in namespace:
                return namespace[property_name]

        # If the property name is not found in the core namespaces, search in the SKOS, DC, and DCTERMS namespaces
        for namespace in [SKOS, DC, DCTERMS]:
            if property_name in namespace:
                return namespace[property_name]

        # If the property name is still not found, try other namespaces from rdflib.
        for namespace in _NAMESPACE_PREFIXES_RDFLIB.values():
            if property_name in namespace:
                return namespace[property_name]

        # If the property name is "licence", it recursively calls the function with "license" as the input.
        if property_name == "licence":
            return self.property_to_uri("license")

        # TODO: add an option to search through manually implemented namespaces

        # If the input is not found in any of the namespaces, it returns the corresponding URI from the biocypher namespace.
        # TODO: give a warning and try to prevent this option altogether
        return self.rdf_namespaces["biocypher"][property_name]
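A hedged sketch of the prefix resolution performed by subject_to_uri() above, with one invented extra namespace standing in for entries from biocypher_config.yaml:

from rdflib import Namespace

rdf_namespaces = {
    "biocypher": Namespace("https://biocypher.org/biocypher#"),
    # hypothetical extra prefix, as it could be configured in biocypher_config.yaml
    "uniprot": Namespace("https://www.uniprot.org/uniprot/"),
}

def subject_to_uri(subject: str):
    # same fallback logic as above: known prefix -> that namespace,
    # unknown prefix or no prefix -> biocypher namespace
    try:
        _pref, _id = subject.split(":")
        if _pref in rdf_namespaces:
            return rdf_namespaces[_pref][_id]
        return rdf_namespaces["biocypher"][subject]
    except ValueError:
        return rdf_namespaces["biocypher"][subject]

print(subject_to_uri("uniprot:P12345"))    # https://www.uniprot.org/uniprot/P12345
print(subject_to_uri("some_internal_id"))  # https://biocypher.org/biocypher#some_internal_id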
    def _init_namespaces(self, graph: Graph):
        """
        Initializes the namespaces for the RDF graph. These namespaces are used to convert nodes to URIs.

        This function adds the biocypher standard namespace to the `rdf_namespaces` attribute of the class.
        If `rdf_namespaces` is empty, it sets it to the biocypher standard namespace. Otherwise, it merges
        the biocypher standard namespace with the namespaces defined in the biocypher_config.yaml.

        Args:
            graph (RDFLib.Graph): The RDF graph to bind the namespaces to.

        Returns:
            None
        """
        # add biocypher standard to self.rdf_namespaces
        biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}
        if not self.rdf_namespaces:
            self.rdf_namespaces = biocypher_standard
        else:
            self.rdf_namespaces = self.rdf_namespaces | biocypher_standard

        for key, value in self.rdf_namespaces.items():
            namespace = Namespace(value)
            self.rdf_namespaces[key] = namespace
            graph.bind(key, namespace)
0  biocypher/output/write/relational/__init__.py  Normal file

76  biocypher/output/write/relational/_csv.py  Normal file
@@ -0,0 +1,76 @@
from more_itertools import peekable

from biocypher._logger import logger
from biocypher.output.write._writer import _Writer
from biocypher.output.in_memory._pandas import Pandas


class _PandasCSVWriter(_Writer):
    """
    Class for writing node and edge representations to a CSV file.
    """

    def __init__(self, *args, write_to_file: bool = True, **kwargs):
        kwargs["write_to_file"] = write_to_file
        super().__init__(*args, **kwargs)
        self.in_memory_dfs = {}
        self.stored_dfs = {}
        self.pandas_in_memory = Pandas(
            translator=self.translator,
            deduplicator=self.deduplicator,
        )
        self.delimiter = kwargs.get("delimiter")
        if not self.delimiter:
            self.delimiter = ","
        self.write_to_file = write_to_file

    def _construct_import_call(self) -> str:
        """Function to construct the Python code to load all node and edge csv files again into Pandas dfs.

        Returns:
            str: Python code to load the csv files into Pandas dfs.
        """
        import_call = "import pandas as pd\n\n"
        for df_name in self.stored_dfs.keys():
            import_call += f"{df_name} = pd.read_csv('./{df_name}.csv', header=0, index_col=0)\n"
        return import_call

    def _get_import_script_name(self) -> str:
        """Function to return the name of the import script."""
        return "import_pandas_csv.py"

    def _write_node_data(self, nodes) -> bool:
        passed = self._write_entities_to_file(nodes)
        return passed

    def _write_edge_data(self, edges) -> bool:
        passed = self._write_entities_to_file(edges)
        return passed

    def _write_entities_to_file(self, entities: iter) -> bool:
        """Function to write the entities to a CSV file.

        Args:
            entities (iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
        """
        entities = peekable(entities)
        entity_list = self.pandas_in_memory._separate_entity_types(entities)
        for entity_type, entities in entity_list.items():
            self.in_memory_dfs[
                entity_type
            ] = self.pandas_in_memory._add_entity_df(entity_type, entities)
        for entity_type in self.in_memory_dfs.keys():
            entity_df = self.in_memory_dfs[entity_type]
            if " " in entity_type or "." in entity_type:
                entity_type = entity_type.replace(" ", "_").replace(".", "_")
            if self.write_to_file:
                logger.info(
                    f"Writing {entity_df.shape[0]} entries to {entity_type}.csv."
                )
                entity_df.to_csv(
                    f"{self.output_directory}/{entity_type}.csv",
                    sep=self.delimiter,
                )
            self.stored_dfs[entity_type] = entity_df
        self.in_memory_dfs = {}
        return True
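For concreteness, this is the kind of script _construct_import_call() above generates (written to import_pandas_csv.py); the entity names are hypothetical and depend on the schema.

import pandas as pd

Protein = pd.read_csv('./Protein.csv', header=0, index_col=0)
ProteinProteinInteraction = pd.read_csv('./ProteinProteinInteraction.csv', header=0, index_col=0)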
320  biocypher/output/write/relational/_postgresql.py  Normal file
@@ -0,0 +1,320 @@
import os
import glob

from biocypher._logger import logger
from biocypher.output.write._batch_writer import _BatchWriter


class _PostgreSQLBatchWriter(_BatchWriter):
    """
    Class for writing node and edge representations to disk using the
    format specified by PostgreSQL for the use of "COPY FROM...". Each batch
    writer instance has a fixed representation that needs to be passed
    at instantiation via the :py:attr:`schema` argument. The instance
    also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
    to convert and extend the hierarchy.

    This class inherits from the abstract class "_BatchWriter" and implements the
    PostgreSQL-specific methods:

        - _write_node_headers
        - _write_edge_headers
        - _construct_import_call
        - _write_array_string
    """

    DATA_TYPE_LOOKUP = {
        "str": "VARCHAR",  # VARCHAR needs limit
        "int": "INTEGER",
        "long": "BIGINT",
        "float": "NUMERIC",
        "double": "NUMERIC",
        "dbl": "NUMERIC",
        "boolean": "BOOLEAN",
        "str[]": "VARCHAR[]",
        "string[]": "VARCHAR[]",
    }

    def __init__(self, *args, **kwargs):
        self._copy_from_csv_commands = set()
        super().__init__(*args, **kwargs)

    def _get_default_import_call_bin_prefix(self):
        """
        Method to provide the default string for the import call bin prefix.

        Returns:
            str: The default location for the psql command
        """
        return ""

    def _get_data_type(self, string) -> str:
        try:
            return self.DATA_TYPE_LOOKUP[string]
        except KeyError:
            logger.info(
                f'Could not determine data type {string}. Using default "VARCHAR"'
            )
            return "VARCHAR"

    def _write_array_string(self, string_list) -> str:
        """
        Abstract method to write the string representation of an array into a .csv file
        as required by the postgresql COPY command, with '{','}' brackets and ',' separation.

        Args:
            string_list (list): list of ontology strings

        Returns:
            str: The string representation of an array for postgres COPY
        """
        string = ",".join(string_list)
        string = f'"{{{string}}}"'
        return string

    def _get_import_script_name(self) -> str:
        """
        Returns the name of the psql import script

        Returns:
            str: The name of the import script (ending in .sh)
        """
        return f"{self.db_name}-import-call.sh"

    def _adjust_pascal_to_psql(self, string):
        string = string.replace(".", "_")
        string = string.lower()
        return string
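A standalone illustration of the two helpers above, with invented values: unknown property types fall back to VARCHAR, and list properties are rendered as quoted Postgres array literals for COPY.

DATA_TYPE_LOOKUP = {"str": "VARCHAR", "int": "INTEGER", "str[]": "VARCHAR[]"}

def get_data_type(string: str) -> str:
    # unknown types fall back to VARCHAR, as in _get_data_type()
    return DATA_TYPE_LOOKUP.get(string, "VARCHAR")

def write_array_string(string_list) -> str:
    # quoted postgres array literal, as produced by _write_array_string()
    return f'"{{{",".join(string_list)}}}"'

print(get_data_type("int"))                            # INTEGER
print(get_data_type("some_unknown_type"))              # VARCHAR
print(write_array_string(["protein", "polypeptide"]))  # "{protein,polypeptide}"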
    def _write_node_headers(self):
        """
        Writes a single table-creation SQL file for each graph entity that is
        represented as a node as per the definition in the `schema_config.yaml`,
        and registers the COPY commands for the corresponding data files.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        # load headers from data parse
        if not self.node_property_dict:
            logger.error(
                "Header information not found. Was the data parsed first?",
            )
            return False

        for label, props in self.node_property_dict.items():
            # create header CSV with ID, properties, labels

            # translate label to PascalCase
            pascal_label = self.translator.name_sentence_to_pascal(label)

            parts = f"{pascal_label}-part*.csv"
            parts_paths = os.path.join(self.outdir, parts)
            parts_paths = glob.glob(parts_paths)
            parts_paths.sort()

            # adjust label for import to psql
            pascal_label = self._adjust_pascal_to_psql(pascal_label)
            table_create_command_path = os.path.join(
                self.outdir,
                f"{pascal_label}-create_table.sql",
            )

            # check if file already exists
            if os.path.exists(table_create_command_path):
                logger.warning(
                    f"File {table_create_command_path} already exists. Overwriting.",
                )

            # concatenate key:value in props
            columns = ["_ID VARCHAR"]
            for col_name, col_type in props.items():
                col_type = self._get_data_type(col_type)
                col_name = self._adjust_pascal_to_psql(col_name)
                columns.append(f"{col_name} {col_type}")
            columns.append("_LABEL VARCHAR[]")

            with open(table_create_command_path, "w", encoding="utf-8") as f:
                command = ""
                if self.wipe:
                    command += f"DROP TABLE IF EXISTS {pascal_label};\n"

                # table creation requires comma separation
                command += (
                    f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
                )
                f.write(command)

            for parts_path in parts_paths:
                # if import_call_file_prefix is set, replace actual path
                # with prefix
                if self.import_call_file_prefix != self.outdir:
                    parts_path = parts_path.replace(
                        self.outdir,
                        self.import_call_file_prefix,
                    )

                self._copy_from_csv_commands.add(
                    f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
                )

            # add file path to import statement
            # if import_call_file_prefix is set, replace actual path
            # with prefix
            if self.import_call_file_prefix != self.outdir:
                table_create_command_path = table_create_command_path.replace(
                    self.outdir,
                    self.import_call_file_prefix,
                )

            self.import_call_nodes.add(table_create_command_path)

        return True
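A worked example of the table-creation statement assembled above, for a hypothetical node class "protein" with properties name (str) and score (float); names and types are invented for illustration.

props = {"name": "str", "score": "float"}
lookup = {"str": "VARCHAR", "float": "NUMERIC"}

pascal_label = "Protein"
psql_label = pascal_label.replace(".", "_").lower()  # _adjust_pascal_to_psql

columns = ["_ID VARCHAR"]
for col_name, col_type in props.items():
    columns.append(f"{col_name.lower()} {lookup[col_type]}")
columns.append("_LABEL VARCHAR[]")

command = f"DROP TABLE IF EXISTS {psql_label};\n"
command += f'CREATE TABLE {psql_label}({",".join(columns)});\n'
print(command)
# DROP TABLE IF EXISTS protein;
# CREATE TABLE protein(_ID VARCHAR,name VARCHAR,score NUMERIC,_LABEL VARCHAR[]);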
    def _write_edge_headers(self):
        """
        Writes a single table-creation SQL file for each graph entity that is
        represented as an edge as per the definition in the `schema_config.yaml`,
        and registers the COPY commands for the corresponding data files.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        # load headers from data parse
        if not self.edge_property_dict:
            logger.error(
                "Header information not found. Was the data parsed first?",
            )
            return False

        for label, props in self.edge_property_dict.items():
            # translate label to PascalCase
            pascal_label = self.translator.name_sentence_to_pascal(label)

            parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv")
            parts_paths = glob.glob(parts_paths)
            parts_paths.sort()

            # adjust label for import to psql
            pascal_label = self._adjust_pascal_to_psql(pascal_label)
            table_create_command_path = os.path.join(
                self.outdir,
                f"{pascal_label}-create_table.sql",
            )

            # check if file already exists
            if os.path.exists(table_create_command_path):
                logger.warning(
                    f"File {table_create_command_path} already exists. Overwriting.",
                )

            # concatenate key:value in props
            columns = []
            for col_name, col_type in props.items():
                col_type = self._get_data_type(col_type)
                col_name = self._adjust_pascal_to_psql(col_name)
                if col_name == "_id":
                    # should ideally never happen; note that col_name has been
                    # lower-cased by _adjust_pascal_to_psql above
                    raise ValueError(
                        "Column name '_ID' is reserved for internal use, "
                        "denoting the relationship ID. Please choose a "
                        "different name for your column."
                    )

                columns.append(f"{col_name} {col_type}")

            # create list of lists and flatten
            # removes need for empty check of property list
            out_list = [
                "_START_ID VARCHAR",
                "_ID VARCHAR",
                *columns,
                "_END_ID VARCHAR",
                "_TYPE VARCHAR",
            ]

            with open(table_create_command_path, "w", encoding="utf-8") as f:
                command = ""
                if self.wipe:
                    command += f"DROP TABLE IF EXISTS {pascal_label};\n"

                # table creation requires comma separation
                command += (
                    f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
                )
                f.write(command)

            for parts_path in parts_paths:
                # if import_call_file_prefix is set, replace actual path
                # with prefix
                if self.import_call_file_prefix != self.outdir:
                    parts_path = parts_path.replace(
                        self.outdir,
                        self.import_call_file_prefix,
                    )

                self._copy_from_csv_commands.add(
                    f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
                )

            # add file path to import statement
            # if import_call_file_prefix is set, replace actual path
            # with prefix
            if self.import_call_file_prefix != self.outdir:
                table_create_command_path = table_create_command_path.replace(
                    self.outdir,
                    self.import_call_file_prefix,
                )

            self.import_call_edges.add(table_create_command_path)

        return True
    def _construct_import_call(self) -> str:
        """
        Function to construct the import call detailing folder and
        individual node and edge headers and data files, as well as
        delimiters and database name. Built after all data has been
        processed to ensure that nodes are called before any edges.

        Returns:
            str: a bash command for postgresql import
        """
        import_call = ""

        # create tables
        # At this point, csv files of nodes and edges do not require differentiation
        for import_file_path in [
            *self.import_call_nodes,
            *self.import_call_edges,
        ]:
            import_call += f'echo "Setup {import_file_path}..."\n'
            if self.db_password:
                # set password variable inline
                import_call += f"PGPASSWORD={self.db_password} "
            import_call += (
                f"{self.import_call_bin_prefix}psql -f {import_file_path}"
            )
            import_call += f" --dbname {self.db_name}"
            import_call += f" --host {self.db_host}"
            import_call += f" --port {self.db_port}"
            import_call += f" --user {self.db_user}"
            import_call += '\necho "Done!"\n'
            import_call += "\n"

        # copy data to tables
        for command in self._copy_from_csv_commands:
            table_part = command.split(" ")[3]
            import_call += f'echo "Importing {table_part}..."\n'
            if self.db_password:
                # set password variable inline
                import_call += f"PGPASSWORD={self.db_password} "
            import_call += f'{self.import_call_bin_prefix}psql -c "{command}"'
            import_call += f" --dbname {self.db_name}"
            import_call += f" --host {self.db_host}"
            import_call += f" --port {self.db_port}"
            import_call += f" --user {self.db_user}"
            import_call += '\necho "Done!"\n'
            import_call += "\n"

        return import_call
51  biocypher/output/write/relational/_sqlite.py  Normal file
@@ -0,0 +1,51 @@
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter


class _SQLiteBatchWriter(_PostgreSQLBatchWriter):
    """
    Class for writing node and edge representations to a SQLite database.
    It uses the _PostgreSQLBatchWriter class under the hood, which already
    implements the logic to write the nodes/edges to a relational DBMS.
    Only the import bash script differs between PostgreSQL and SQLite
    and is therefore implemented in this class.

        - _construct_import_call
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _construct_import_call(self) -> str:
        """
        Function to construct the import call detailing folder and
        individual node and edge headers and data files, as well as
        delimiters and database name. Built after all data has been
        processed to ensure that nodes are called before any edges.

        Returns:
            str: a bash command for sqlite import
        """
        import_call = ""

        # create tables
        # At this point, csv files of nodes and edges do not require differentiation
        for import_file_path in [
            *self.import_call_nodes,
            *self.import_call_edges,
        ]:
            import_call += f'echo "Setup {import_file_path}..."\n'
            import_call += f"{self.import_call_bin_prefix}sqlite3 {self.db_name} < {import_file_path}"
            import_call += '\necho "Done!"\n'
            import_call += "\n"

        for command in self._copy_from_csv_commands:
            table_name = command.split(" ")[1]
            table_part = command.split(" ")[3].replace("'", "")
            import_call += f'echo "Importing {table_part}..."\n'
            separator = self.delim
            import_part = f".import {table_part} {table_name}"
            import_call += f"{self.import_call_bin_prefix}sqlite3 -separator $'{separator}' {self.db_name} \"{import_part}\""
            import_call += '\necho "Done!"\n'
            import_call += "\n"

        return import_call
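A short sketch of how the stored postgres-style \copy commands are re-parsed above to build the sqlite3 calls; the path and database name are invented.

command = "\\copy protein FROM '/tmp/biocypher-out/Protein-part000.csv' DELIMITER E'\\t' CSV;"

table_name = command.split(" ")[1]                   # 'protein'
table_part = command.split(" ")[3].replace("'", "")  # '/tmp/biocypher-out/Protein-part000.csv'

import_part = f".import {table_part} {table_name}"
print(f"sqlite3 -separator $'\\t' biocypher \"{import_part}\"")
# sqlite3 -separator $'\t' biocypher ".import /tmp/biocypher-out/Protein-part000.csv protein"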