# 2025-04-16 22:12:19 +02:00
# 242 lines
# 7.9 KiB
# Python

import os
from biocypher._logger import logger
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
class _ArangoDBBatchWriter(_Neo4jBatchWriter):
"""
Class for writing node and edge representations to disk using the format
specified by ArangoDB for the use of "arangoimport". Output files are
similar to Neo4j, but with a different header format.
"""
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location for the neo4j admin import location
"""
return ""
def _get_import_script_name(self) -> str:
"""
Returns the name of the neo4j admin import script
Returns:
str: The name of the import script (ending in .sh)
"""
return "arangodb-import-call.sh"
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
# create header CSV with ID, properties, labels
_id = "_key"
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"File {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k in props.keys():
props_list.append(f"{k}")
# create list of lists and flatten
# removes need for empty check of property list
out_list = [[_id], props_list]
out_list = [val for sublist in out_list for val in sublist]
with open(header_path, "w", encoding="utf-8") as f:
# concatenate with delimiter
row = self.delim.join(out_list)
f.write(row)
# add collection from schema config
collection = self.translator.ontology.mapping.extended_schema[
label
].get("db_collection_name", None)
# add file path to neo4 admin import statement
# do once for each part file
parts = self.parts.get(label, [])
if not parts:
raise ValueError(
f"No parts found for node label {label}. "
f"Check that the data was parsed first.",
)
for part in parts:
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
part,
)
self.import_call_nodes.add(
(
import_call_header_path,
import_call_parts_path,
collection,
)
)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
# paths
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
# check for file exists
if os.path.exists(header_path):
logger.warning(
f"Header file {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k in props.keys():
props_list.append(f"{k}")
out_list = ["_from", "_key", *props_list, "_to"]
with open(header_path, "w", encoding="utf-8") as f:
# concatenate with delimiter
row = self.delim.join(out_list)
f.write(row)
# add collection from schema config
if not self.translator.ontology.mapping.extended_schema.get(label):
for (
_,
v,
) in self.translator.ontology.mapping.extended_schema.items():
if v.get("label_as_edge") == label:
collection = v.get("db_collection_name", None)
break
else:
collection = self.translator.ontology.mapping.extended_schema[
label
].get("db_collection_name", None)
# add file path to neo4 admin import statement (import call path
# may be different from actual output path)
header_import_call_path = os.path.join(
self.import_call_file_prefix,
header,
)
parts_import_call_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_edges.add(
(
header_import_call_path,
parts_import_call_path,
collection,
)
)
return True
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for neo4j-admin import
"""
import_call = (
f"{self.import_call_bin_prefix}arangoimp "
f"--type csv "
f'--separator="{self.escaped_delim}" '
)
if self.quote == "'":
import_call += f'--quote="{self.quote}" '
else:
import_call += f"--quote='{self.quote}' "
node_lines = ""
# node import calls: one line per node type
for header_path, parts_path, collection in self.import_call_nodes:
line = (
f"{import_call} "
f"--headers-file {header_path} "
f"--file= {parts_path} "
)
if collection:
line += f"--create-collection --collection {collection} "
node_lines += f"{line}\n"
edge_lines = ""
# edge import calls: one line per edge type
for header_path, parts_path, collection in self.import_call_edges:
import_call += f'--relationships="{header_path},{parts_path}" '
return node_lines + edge_lines