release commit

2025-04-16 22:12:19 +02:00
commit a9db0be88a
89 changed files with 2336827 additions and 0 deletions
--- a/biocypher/output/write/graph/_arangodb.py
+++ b/biocypher/output/write/graph/_arangodb.py
@ -0,0 +1,241 @@
+import os
+
+from biocypher._logger import logger
+from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
+
+
+class _ArangoDBBatchWriter(_Neo4jBatchWriter):
+    """
+    Class for writing node and edge representations to disk using the format
+    specified by ArangoDB for the use of "arangoimport". Output files are
+    similar to Neo4j, but with a different header format.
+    """
+
+    def _get_default_import_call_bin_prefix(self):
+        """
+        Method to provide the default string for the import call bin prefix.
+
+        Returns:
+            str: The default location for the neo4j admin import location
+        """
+        return ""
+
+    def _get_import_script_name(self) -> str:
+        """
+        Returns the name of the neo4j admin import script
+
+        Returns:
+            str: The name of the import script (ending in .sh)
+        """
+        return "arangodb-import-call.sh"
+
+    def _write_node_headers(self):
+        """
+        Writes single CSV file for a graph entity that is represented
+        as a node as per the definition in the `schema_config.yaml`,
+        containing only the header for this type of node.
+
+        Returns:
+            bool: The return value. True for success, False otherwise.
+        """
+        # load headers from data parse
+        if not self.node_property_dict:
+            logger.error(
+                "Header information not found. Was the data parsed first?",
+            )
+            return False
+
+        for label, props in self.node_property_dict.items():
+            # create header CSV with ID, properties, labels
+
+            _id = "_key"
+
+            # translate label to PascalCase
+            pascal_label = self.translator.name_sentence_to_pascal(label)
+
+            header = f"{pascal_label}-header.csv"
+            header_path = os.path.join(
+                self.outdir,
+                header,
+            )
+
+            # check if file already exists
+            if os.path.exists(header_path):
+                logger.warning(
+                    f"File {header_path} already exists. Overwriting."
+                )
+
+            # concatenate key:value in props
+            props_list = []
+            for k in props.keys():
+                props_list.append(f"{k}")
+
+            # create list of lists and flatten
+            # removes need for empty check of property list
+            out_list = [[_id], props_list]
+            out_list = [val for sublist in out_list for val in sublist]
+
+            with open(header_path, "w", encoding="utf-8") as f:
+                # concatenate with delimiter
+                row = self.delim.join(out_list)
+                f.write(row)
+
+            # add collection from schema config
+            collection = self.translator.ontology.mapping.extended_schema[
+                label
+            ].get("db_collection_name", None)
+
+            # add file path to neo4 admin import statement
+            # do once for each part file
+            parts = self.parts.get(label, [])
+
+            if not parts:
+                raise ValueError(
+                    f"No parts found for node label {label}. "
+                    f"Check that the data was parsed first.",
+                )
+
+            for part in parts:
+                import_call_header_path = os.path.join(
+                    self.import_call_file_prefix,
+                    header,
+                )
+                import_call_parts_path = os.path.join(
+                    self.import_call_file_prefix,
+                    part,
+                )
+
+                self.import_call_nodes.add(
+                    (
+                        import_call_header_path,
+                        import_call_parts_path,
+                        collection,
+                    )
+                )
+
+        return True
+
+    def _write_edge_headers(self):
+        """
+        Writes single CSV file for a graph entity that is represented
+        as an edge as per the definition in the `schema_config.yaml`,
+        containing only the header for this type of edge.
+
+        Returns:
+            bool: The return value. True for success, False otherwise.
+        """
+        # load headers from data parse
+        if not self.edge_property_dict:
+            logger.error(
+                "Header information not found. Was the data parsed first?",
+            )
+            return False
+
+        for label, props in self.edge_property_dict.items():
+            # translate label to PascalCase
+            pascal_label = self.translator.name_sentence_to_pascal(label)
+
+            # paths
+            header = f"{pascal_label}-header.csv"
+            header_path = os.path.join(
+                self.outdir,
+                header,
+            )
+            parts = f"{pascal_label}-part.*"
+
+            # check for file exists
+            if os.path.exists(header_path):
+                logger.warning(
+                    f"Header file {header_path} already exists. Overwriting."
+                )
+
+            # concatenate key:value in props
+            props_list = []
+            for k in props.keys():
+                props_list.append(f"{k}")
+
+            out_list = ["_from", "_key", *props_list, "_to"]
+
+            with open(header_path, "w", encoding="utf-8") as f:
+                # concatenate with delimiter
+                row = self.delim.join(out_list)
+                f.write(row)
+
+            # add collection from schema config
+            if not self.translator.ontology.mapping.extended_schema.get(label):
+                for (
+                    _,
+                    v,
+                ) in self.translator.ontology.mapping.extended_schema.items():
+                    if v.get("label_as_edge") == label:
+                        collection = v.get("db_collection_name", None)
+                        break
+
+            else:
+                collection = self.translator.ontology.mapping.extended_schema[
+                    label
+                ].get("db_collection_name", None)
+
+            # add file path to neo4 admin import statement (import call path
+            # may be different from actual output path)
+            header_import_call_path = os.path.join(
+                self.import_call_file_prefix,
+                header,
+            )
+            parts_import_call_path = os.path.join(
+                self.import_call_file_prefix,
+                parts,
+            )
+            self.import_call_edges.add(
+                (
+                    header_import_call_path,
+                    parts_import_call_path,
+                    collection,
+                )
+            )
+
+        return True
+
+    def _construct_import_call(self) -> str:
+        """
+        Function to construct the import call detailing folder and
+        individual node and edge headers and data files, as well as
+        delimiters and database name. Built after all data has been
+        processed to ensure that nodes are called before any edges.
+
+        Returns:
+            str: a bash command for neo4j-admin import
+        """
+        import_call = (
+            f"{self.import_call_bin_prefix}arangoimp "
+            f"--type csv "
+            f'--separator="{self.escaped_delim}" '
+        )
+
+        if self.quote == "'":
+            import_call += f'--quote="{self.quote}" '
+        else:
+            import_call += f"--quote='{self.quote}' "
+
+        node_lines = ""
+
+        # node import calls: one line per node type
+        for header_path, parts_path, collection in self.import_call_nodes:
+            line = (
+                f"{import_call} "
+                f"--headers-file {header_path} "
+                f"--file= {parts_path} "
+            )
+
+            if collection:
+                line += f"--create-collection --collection {collection} "
+
+            node_lines += f"{line}\n"
+
+        edge_lines = ""
+
+        # edge import calls: one line per edge type
+        for header_path, parts_path, collection in self.import_call_edges:
+            import_call += f'--relationships="{header_path},{parts_path}" '
+
+        return node_lines + edge_lines