242 lines
7.9 KiB
Python
242 lines
7.9 KiB
Python
import os
|
|
|
|
from biocypher._logger import logger
|
|
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
|
|
|
|
|
|
class _ArangoDBBatchWriter(_Neo4jBatchWriter):
    """Write node and edge representations to disk for ``arangoimport``.

    The data (part) files are the same as those produced by the Neo4j writer
    this class derives from; only the header files and the generated import
    script differ. Headers use the ArangoDB system attribute names ``_key``,
    ``_from`` and ``_to``.
    """

    def _get_default_import_call_bin_prefix(self):
        """Provide the default prefix prepended to the import binary name.

        Returns:
            str: An empty string, i.e. the import binary is invoked by its
                bare name.
        """
        return ""

    def _get_import_script_name(self) -> str:
        """Return the file name of the generated ArangoDB import script.

        Returns:
            str: The name of the import script (ending in .sh)
        """
        return "arangodb-import-call.sh"

    def _write_node_headers(self):
        """Write one header CSV per node label and register import paths.

        For every label in ``self.node_property_dict`` a file
        ``<PascalLabel>-header.csv`` is written to ``self.outdir`` containing
        ``_key`` followed by the property names, joined by ``self.delim``.
        One ``(header, part, collection)`` tuple per part file is then added
        to ``self.import_call_nodes``.

        Returns:
            bool: True for success, False if no node header information is
                available.

        Raises:
            ValueError: If no part files are registered for a node label.
        """
        # headers are collected while the data is parsed
        if not self.node_property_dict:
            logger.error(
                "Header information not found. Was the data parsed first?",
            )
            return False

        for label, props in self.node_property_dict.items():
            # translate label to PascalCase
            pascal_label = self.translator.name_sentence_to_pascal(label)

            header = f"{pascal_label}-header.csv"
            header_path = os.path.join(self.outdir, header)

            if os.path.exists(header_path):
                logger.warning(
                    f"File {header_path} already exists. Overwriting."
                )

            # `_key` first, then the property names; an empty property dict
            # simply yields a header consisting of `_key` only
            out_list = ["_key", *[f"{k}" for k in props]]

            with open(header_path, "w", encoding="utf-8") as f:
                f.write(self.delim.join(out_list))

            # optional target collection from the schema config
            collection = self.translator.ontology.mapping.extended_schema[
                label
            ].get("db_collection_name", None)

            parts = self.parts.get(label, [])

            if not parts:
                raise ValueError(
                    f"No parts found for node label {label}. "
                    f"Check that the data was parsed first.",
                )

            # the path used in the import call may differ from the actual
            # output path, hence the separate prefix
            import_call_header_path = os.path.join(
                self.import_call_file_prefix,
                header,
            )

            # one import call entry per part file
            for part in parts:
                import_call_parts_path = os.path.join(
                    self.import_call_file_prefix,
                    part,
                )

                self.import_call_nodes.add(
                    (
                        import_call_header_path,
                        import_call_parts_path,
                        collection,
                    )
                )

        return True

    def _write_edge_headers(self):
        """Write one header CSV per edge label and register import paths.

        For every label in ``self.edge_property_dict`` a file
        ``<PascalLabel>-header.csv`` is written to ``self.outdir`` containing
        ``_from``, ``_key``, the property names and ``_to``, joined by
        ``self.delim``. ``(header, part, collection)`` tuples are then added
        to ``self.import_call_edges``.

        Returns:
            bool: True for success, False if no edge header information is
                available.
        """
        # headers are collected while the data is parsed
        if not self.edge_property_dict:
            logger.error(
                "Header information not found. Was the data parsed first?",
            )
            return False

        for label, props in self.edge_property_dict.items():
            # translate label to PascalCase
            pascal_label = self.translator.name_sentence_to_pascal(label)

            header = f"{pascal_label}-header.csv"
            header_path = os.path.join(self.outdir, header)

            if os.path.exists(header_path):
                logger.warning(
                    f"Header file {header_path} already exists. Overwriting."
                )

            out_list = ["_from", "_key", *[f"{k}" for k in props], "_to"]

            with open(header_path, "w", encoding="utf-8") as f:
                f.write(self.delim.join(out_list))

            # optional target collection from the schema config
            schema = self.translator.ontology.mapping.extended_schema
            schema_entry = schema.get(label)

            if schema_entry:
                collection = schema_entry.get("db_collection_name", None)
            else:
                # The label is not a schema key; look for an entry that is
                # relabelled to it via `label_as_edge`. Start from None so
                # that a failed search neither raises UnboundLocalError nor
                # reuses the collection of the previous label.
                collection = None
                for entry in schema.values():
                    if entry.get("label_as_edge") == label:
                        collection = entry.get("db_collection_name", None)
                        break

            # Prefer the tracked part files, as done for nodes.
            # NOTE(review): assumes `self.parts` also records edge part files
            # under the edge label - confirm against the parent writer. If
            # nothing is tracked, fall back to the previous
            # `<PascalLabel>-part.*` pattern.
            part_names = self.parts.get(label) or [f"{pascal_label}-part.*"]

            # the path used in the import call may differ from the actual
            # output path, hence the separate prefix
            header_import_call_path = os.path.join(
                self.import_call_file_prefix,
                header,
            )

            for part in part_names:
                parts_import_call_path = os.path.join(
                    self.import_call_file_prefix,
                    part,
                )

                self.import_call_edges.add(
                    (
                        header_import_call_path,
                        parts_import_call_path,
                        collection,
                    )
                )

        return True

    def _construct_import_call(self) -> str:
        """Build the shell commands that import all nodes and edges.

        One command line is generated per registered
        ``(header, part, collection)`` tuple. All node commands are emitted
        before any edge command so that the referenced documents exist when
        the edges are imported.

        Returns:
            str: Newline-terminated import commands, nodes first, then edges.
        """
        # NOTE(review): the binary is called `arangoimp` here while the class
        # docstring refers to `arangoimport`; kept unchanged - confirm which
        # name the targeted ArangoDB versions ship.
        import_call = (
            f"{self.import_call_bin_prefix}arangoimp "
            f"--type csv "
            f'--separator="{self.escaped_delim}" '
        )

        # wrap the quote character in the respective other quote type
        if self.quote == "'":
            import_call += f'--quote="{self.quote}" '
        else:
            import_call += f"--quote='{self.quote}' "

        lines = []

        # node import calls: one line per node part file
        for header_path, parts_path, collection in self.import_call_nodes:
            line = (
                f"{import_call} "
                f"--headers-file {header_path} "
                f"--file {parts_path} "
            )

            if collection:
                line += f"--create-collection --collection {collection} "

            lines.append(f"{line}\n")

        # edge import calls: one line per edge part file; edge collections
        # must be created with the `edge` collection type
        for header_path, parts_path, collection in self.import_call_edges:
            line = (
                f"{import_call} "
                f"--headers-file {header_path} "
                f"--file {parts_path} "
            )

            if collection:
                line += (
                    f"--create-collection --create-collection-type edge "
                    f"--collection {collection} "
                )

            lines.append(f"{line}\n")

        return "".join(lines)
|