release commit
This commit is contained in:
241
biocypher/output/write/graph/_arangodb.py
Normal file
241
biocypher/output/write/graph/_arangodb.py
Normal file
@ -0,0 +1,241 @@
|
||||
import os
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
|
||||
|
||||
|
||||
class _ArangoDBBatchWriter(_Neo4jBatchWriter):
|
||||
"""
|
||||
Class for writing node and edge representations to disk using the format
|
||||
specified by ArangoDB for the use of "arangoimport". Output files are
|
||||
similar to Neo4j, but with a different header format.
|
||||
"""
|
||||
|
||||
def _get_default_import_call_bin_prefix(self):
|
||||
"""
|
||||
Method to provide the default string for the import call bin prefix.
|
||||
|
||||
Returns:
|
||||
str: The default location for the neo4j admin import location
|
||||
"""
|
||||
return ""
|
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""
|
||||
Returns the name of the neo4j admin import script
|
||||
|
||||
Returns:
|
||||
str: The name of the import script (ending in .sh)
|
||||
"""
|
||||
return "arangodb-import-call.sh"
|
||||
|
||||
def _write_node_headers(self):
|
||||
"""
|
||||
Writes single CSV file for a graph entity that is represented
|
||||
as a node as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of node.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.node_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.node_property_dict.items():
|
||||
# create header CSV with ID, properties, labels
|
||||
|
||||
_id = "_key"
|
||||
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
header = f"{pascal_label}-header.csv"
|
||||
header_path = os.path.join(
|
||||
self.outdir,
|
||||
header,
|
||||
)
|
||||
|
||||
# check if file already exists
|
||||
if os.path.exists(header_path):
|
||||
logger.warning(
|
||||
f"File {header_path} already exists. Overwriting."
|
||||
)
|
||||
|
||||
# concatenate key:value in props
|
||||
props_list = []
|
||||
for k in props.keys():
|
||||
props_list.append(f"{k}")
|
||||
|
||||
# create list of lists and flatten
|
||||
# removes need for empty check of property list
|
||||
out_list = [[_id], props_list]
|
||||
out_list = [val for sublist in out_list for val in sublist]
|
||||
|
||||
with open(header_path, "w", encoding="utf-8") as f:
|
||||
# concatenate with delimiter
|
||||
row = self.delim.join(out_list)
|
||||
f.write(row)
|
||||
|
||||
# add collection from schema config
|
||||
collection = self.translator.ontology.mapping.extended_schema[
|
||||
label
|
||||
].get("db_collection_name", None)
|
||||
|
||||
# add file path to neo4 admin import statement
|
||||
# do once for each part file
|
||||
parts = self.parts.get(label, [])
|
||||
|
||||
if not parts:
|
||||
raise ValueError(
|
||||
f"No parts found for node label {label}. "
|
||||
f"Check that the data was parsed first.",
|
||||
)
|
||||
|
||||
for part in parts:
|
||||
import_call_header_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
header,
|
||||
)
|
||||
import_call_parts_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
part,
|
||||
)
|
||||
|
||||
self.import_call_nodes.add(
|
||||
(
|
||||
import_call_header_path,
|
||||
import_call_parts_path,
|
||||
collection,
|
||||
)
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def _write_edge_headers(self):
|
||||
"""
|
||||
Writes single CSV file for a graph entity that is represented
|
||||
as an edge as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of edge.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.edge_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.edge_property_dict.items():
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
# paths
|
||||
header = f"{pascal_label}-header.csv"
|
||||
header_path = os.path.join(
|
||||
self.outdir,
|
||||
header,
|
||||
)
|
||||
parts = f"{pascal_label}-part.*"
|
||||
|
||||
# check for file exists
|
||||
if os.path.exists(header_path):
|
||||
logger.warning(
|
||||
f"Header file {header_path} already exists. Overwriting."
|
||||
)
|
||||
|
||||
# concatenate key:value in props
|
||||
props_list = []
|
||||
for k in props.keys():
|
||||
props_list.append(f"{k}")
|
||||
|
||||
out_list = ["_from", "_key", *props_list, "_to"]
|
||||
|
||||
with open(header_path, "w", encoding="utf-8") as f:
|
||||
# concatenate with delimiter
|
||||
row = self.delim.join(out_list)
|
||||
f.write(row)
|
||||
|
||||
# add collection from schema config
|
||||
if not self.translator.ontology.mapping.extended_schema.get(label):
|
||||
for (
|
||||
_,
|
||||
v,
|
||||
) in self.translator.ontology.mapping.extended_schema.items():
|
||||
if v.get("label_as_edge") == label:
|
||||
collection = v.get("db_collection_name", None)
|
||||
break
|
||||
|
||||
else:
|
||||
collection = self.translator.ontology.mapping.extended_schema[
|
||||
label
|
||||
].get("db_collection_name", None)
|
||||
|
||||
# add file path to neo4 admin import statement (import call path
|
||||
# may be different from actual output path)
|
||||
header_import_call_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
header,
|
||||
)
|
||||
parts_import_call_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
parts,
|
||||
)
|
||||
self.import_call_edges.add(
|
||||
(
|
||||
header_import_call_path,
|
||||
parts_import_call_path,
|
||||
collection,
|
||||
)
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to construct the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name. Built after all data has been
|
||||
processed to ensure that nodes are called before any edges.
|
||||
|
||||
Returns:
|
||||
str: a bash command for neo4j-admin import
|
||||
"""
|
||||
import_call = (
|
||||
f"{self.import_call_bin_prefix}arangoimp "
|
||||
f"--type csv "
|
||||
f'--separator="{self.escaped_delim}" '
|
||||
)
|
||||
|
||||
if self.quote == "'":
|
||||
import_call += f'--quote="{self.quote}" '
|
||||
else:
|
||||
import_call += f"--quote='{self.quote}' "
|
||||
|
||||
node_lines = ""
|
||||
|
||||
# node import calls: one line per node type
|
||||
for header_path, parts_path, collection in self.import_call_nodes:
|
||||
line = (
|
||||
f"{import_call} "
|
||||
f"--headers-file {header_path} "
|
||||
f"--file= {parts_path} "
|
||||
)
|
||||
|
||||
if collection:
|
||||
line += f"--create-collection --collection {collection} "
|
||||
|
||||
node_lines += f"{line}\n"
|
||||
|
||||
edge_lines = ""
|
||||
|
||||
# edge import calls: one line per edge type
|
||||
for header_path, parts_path, collection in self.import_call_edges:
|
||||
import_call += f'--relationships="{header_path},{parts_path}" '
|
||||
|
||||
return node_lines + edge_lines
|
Reference in New Issue
Block a user